diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..821c9f5c --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-08-07T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2212.09597v6","updated":"2023-08-07T17:50:52Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v6.pdf","comment":"ACL 2023, fixed Equation 2"},{"id":"http://arxiv.org/abs/2308.03742v1","updated":"2023-08-07T17:46:49Z","published":"2023-08-07T17:46:49Z","title":"What about translation? New coding system for content analysis on the\n perception of literary translation around the political transformation in\n 1989 in Hungary as a classification problem on an unbalanced dataset","summary":" To track trends in the perception of literary translation around the\npolitical transformation in 1989 in Hungary, a coding system was developed on\nthe paragraphs of the 1980-1999 issues of the literary journal Alf\\\"old. This\npaper describes how we trained BERT models to carry over the coding system to\nthe 1980-1999 issues of the literary journal Nagyvil\\'ag. We use extensive\nhyperparameter tuning, loss functions robust to label unbalance, 10-fold\ncross-validation for precise evaluations and a model ensemble for prediction,\nmanual validation on the predict set, a new calibration method to better\npredict label counts for sections of the Nagyvil\\'ag corpus, and to study the\nrelations between labels, we construct label relation networks.\n","authors":["Dalma Galambos","Pál Zsámboki"],"pdf_url":"https://arxiv.org/pdf/2308.03742v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2301.09656v3","updated":"2023-08-07T17:40:40Z","published":"2023-01-23T19:00:02Z","title":"Selective Explanations: Leveraging Human Input to Align Explainable AI","summary":" While a vast collection of explainable AI (XAI) algorithms have been\ndeveloped in recent years, they are often criticized for significant gaps with\nhow humans produce and consume explanations. As a result, current XAI\ntechniques are often found to be hard to use and lack effectiveness. In this\nwork, we attempt to close these gaps by making AI explanations selective -- a\nfundamental property of human explanations -- by selectively presenting a\nsubset from a large set of model reasons based on what aligns with the\nrecipient's preferences. We propose a general framework for generating\nselective explanations by leveraging human input on a small sample. This\nframework opens up a rich design space that accounts for different selectivity\ngoals, types of input, and more. 
As a showcase, we use a decision-support task\nto explore selective explanations based on what the decision-maker would\nconsider relevant to the decision task. We conducted two experimental studies\nto examine three out of a broader possible set of paradigms based on our\nproposed framework: in Study 1, we ask the participants to provide their own\ninput to generate selective explanations, with either open-ended or\ncritique-based input. In Study 2, we show participants selective explanations\nbased on input from a panel of similar users (annotators). Our experiments\ndemonstrate the promise of selective explanations in reducing over-reliance on\nAI and improving decision outcomes and subjective perceptions of the AI, but\nalso paint a nuanced picture that attributes some of these positive effects to\nthe opportunity to provide one's own input to augment AI explanations. Overall,\nour work proposes a novel XAI framework inspired by human communication\nbehaviors and demonstrates its potentials to encourage future work to better\nalign AI explanations with human production and consumption of explanations.\n","authors":["Vivian Lai","Yiming Zhang","Chacha Chen","Q. Vera Liao","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2301.09656v3.pdf","comment":"21 pages, 25 figures"},{"id":"http://arxiv.org/abs/2307.14361v2","updated":"2023-08-07T17:09:07Z","published":"2023-07-24T21:01:46Z","title":"A Hybrid Machine Learning Model for Classifying Gene Mutations in Cancer\n using LSTM, BiLSTM, CNN, GRU, and GloVe","summary":" This study presents an ensemble model combining LSTM, BiLSTM, CNN, GRU, and\nGloVe to classify gene mutations using Kaggle's Personalized Medicine:\nRedefining Cancer Treatment dataset. The results were compared against\nwell-known transformers like as BERT, Electra, Roberta, XLNet, Distilbert, and\ntheir LSTM ensembles. Our model outperformed all other models in terms of\naccuracy, precision, recall, F1 score, and Mean Squared Error. Surprisingly, it\nalso needed less training time, resulting in a perfect combination of\nperformance and efficiency. This study demonstrates the utility of ensemble\nmodels for difficult tasks such as gene mutation classification.\n","authors":["Sanad Aburass","Osama Dorgham","Jamil Al Shaqsi"],"pdf_url":"https://arxiv.org/pdf/2307.14361v2.pdf","comment":"6 pages, 7 figures and 2 tables"},{"id":"http://arxiv.org/abs/2308.03688v1","updated":"2023-08-07T16:08:11Z","published":"2023-08-07T16:08:11Z","title":"AgentBench: Evaluating LLMs as Agents","summary":" Large Language Models (LLMs) are becoming increasingly smart and autonomous,\ntargeting real-world pragmatic missions beyond traditional NLP tasks. As a\nresult, there has been an urgent need to evaluate LLMs as agents on challenging\ntasks in interactive environments. We present AgentBench, a multi-dimensional\nevolving benchmark that currently consists of 8 distinct environments to assess\nLLM-as-Agent's reasoning and decision-making abilities in a multi-turn\nopen-ended generation setting. Our extensive test over 25 LLMs (including APIs\nand open-sourced models) shows that, while top commercial LLMs present a strong\nability of acting as agents in complex environments, there is a significant\ndisparity in performance between them and open-sourced competitors. It also\nserves as a component of an ongoing project with wider coverage and deeper\nconsideration towards systematic LLM evaluation. 
Datasets, environments, and an\nintegrated evaluation package for AgentBench are released at\nhttps://github.com/THUDM/AgentBench\n","authors":["Xiao Liu","Hao Yu","Hanchen Zhang","Yifan Xu","Xuanyu Lei","Hanyu Lai","Yu Gu","Hangliang Ding","Kaiwen Men","Kejuan Yang","Shudan Zhang","Xiang Deng","Aohan Zeng","Zhengxiao Du","Chenhui Zhang","Sheng Shen","Tianjun Zhang","Yu Su","Huan Sun","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2308.03688v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2308.03660v1","updated":"2023-08-07T15:20:20Z","published":"2023-08-07T15:20:20Z","title":"Detecting Spells in Fantasy Literature with a Transformer Based\n Artificial Intelligence","summary":" Transformer architectures and models have made significant progress in\nlanguage-based tasks. In this area, is BERT one of the most widely used and\nfreely available transformer architecture. In our work, we use BERT for\ncontext-based phrase recognition of magic spells in the Harry Potter novel\nseries. Spells are a common part of active magic in fantasy novels. Typically,\nspells are used in a specific context to achieve a supernatural effect. A\nseries of investigations were conducted to see if a Transformer architecture\ncould recognize such phrases based on their context in the Harry Potter saga.\nFor our studies a pre-trained BERT model was used and fine-tuned utilising\ndifferent datasets and training methods to identify the searched context. By\nconsidering different approaches for sequence classification as well as token\nclassification, it is shown that the context of spells can be recognised.\nAccording to our investigations, the examined sequence length for fine-tuning\nand validation of the model plays a significant role in context recognition.\nBased on this, we have investigated whether spells have overarching properties\nthat allow a transfer of the neural network models to other fantasy universes\nas well. The application of our model showed promising results and is worth to\nbe deepened in subsequent studies.\n","authors":["Marcel Moravek","Alexander Zender","Andreas Müller"],"pdf_url":"https://arxiv.org/pdf/2308.03660v1.pdf","comment":"18 pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2308.03656v1","updated":"2023-08-07T15:18:30Z","published":"2023-08-07T15:18:30Z","title":"Emotionally Numb or Empathetic? Evaluating How LLMs Feel Using\n EmotionBench","summary":" Recently, the community has witnessed the advancement of Large Language\nModels (LLMs), which have shown remarkable performance on various downstream\ntasks. Led by powerful models like ChatGPT and Claude, LLMs are revolutionizing\nhow users engage with software, assuming more than mere tools but intelligent\nassistants. Consequently, evaluating LLMs' anthropomorphic capabilities becomes\nincreasingly important in contemporary discourse. Utilizing the emotion\nappraisal theory from psychology, we propose to evaluate the empathy ability of\nLLMs, i.e., how their feelings change when presented with specific situations.\nAfter a careful and comprehensive survey, we collect a dataset containing over\n400 situations that have proven effective in eliciting the eight emotions\ncentral to our study. Categorizing the situations into 36 factors, we conduct a\nhuman evaluation involving more than 1,200 subjects worldwide. 
With the human\nevaluation results as references, our evaluation includes five LLMs, covering\nboth commercial and open-source models, including variations in model sizes,\nfeaturing the latest iterations, such as GPT-4 and LLaMA 2. A conclusion can be\ndrawn from the results that, despite several misalignments, LLMs can generally\nrespond appropriately to certain situations. Nevertheless, they fall short in\nalignment with the emotional behaviors of human beings and cannot establish\nconnections between similar situations. Our collected dataset of situations,\nthe human evaluation results, and the code of our testing framework, dubbed\nEmotionBench, is made publicly in https://github.com/CUHK-ARISE/EmotionBench.\nWe aspire to contribute to the advancement of LLMs regarding better alignment\nwith the emotional behaviors of human beings, thereby enhancing their utility\nand applicability as intelligent assistants.\n","authors":["Jen-tse Huang","Man Ho Lam","Eric John Li","Shujie Ren","Wenxuan Wang","Wenxiang Jiao","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2308.03656v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2308.00121v2","updated":"2023-08-07T14:57:11Z","published":"2023-07-24T19:59:22Z","title":"Getting pwn'd by AI: Penetration Testing with Large Language Models","summary":" The field of software security testing, more specifically penetration\ntesting, is an activity that requires high levels of expertise and involves\nmany manual testing and analysis steps. This paper explores the potential usage\nof large-language models, such as GPT3.5, to augment penetration testers with\nAI sparring partners. We explore the feasibility of supplementing penetration\ntesters with AI models for two distinct use cases: high-level task planning for\nsecurity testing assignments and low-level vulnerability hunting within a\nvulnerable virtual machine. For the latter, we implemented a closed-feedback\nloop between LLM-generated low-level actions with a vulnerable virtual machine\n(connected through SSH) and allowed the LLM to analyze the machine state for\nvulnerabilities and suggest concrete attack vectors which were automatically\nexecuted within the virtual machine. We discuss promising initial results,\ndetail avenues for improvement, and close deliberating on the ethics of\nproviding AI-based sparring partners.\n","authors":["Andreas Happe","Jürgen Cito"],"pdf_url":"https://arxiv.org/pdf/2308.00121v2.pdf","comment":"5 pages, 1 figure, vision paper FSE'23"},{"id":"http://arxiv.org/abs/2308.03638v1","updated":"2023-08-07T14:42:49Z","published":"2023-08-07T14:42:49Z","title":"KITLM: Domain-Specific Knowledge InTegration into Language Models for\n Question Answering","summary":" Large language models (LLMs) have demonstrated remarkable performance in a\nwide range of natural language tasks. However, as these models continue to grow\nin size, they face significant challenges in terms of computational costs.\nAdditionally, LLMs often lack efficient domain-specific understanding, which is\nparticularly crucial in specialized fields such as aviation and healthcare. To\nboost the domain-specific understanding, we propose, KITLM, a novel knowledge\nbase integration approach into language model through relevant information\ninfusion. By integrating pertinent knowledge, not only the performance of the\nlanguage model is greatly enhanced, but the model size requirement is also\nsignificantly reduced while achieving comparable performance. 
Our proposed\nknowledge-infused model surpasses the performance of both GPT-3.5-turbo and the\nstate-of-the-art knowledge infusion method, SKILL, achieving over 1.5 times\nimprovement in exact match scores on the MetaQA. KITLM showed a similar\nperformance boost in the aviation domain with AeroQA. The drastic performance\nimprovement of KITLM over the existing methods can be attributed to the\ninfusion of relevant knowledge while mitigating noise. In addition, we release\ntwo curated datasets to accelerate knowledge infusion research in specialized\nfields: a) AeroQA, a new benchmark dataset designed for multi-hop\nquestion-answering within the aviation domain, and b) Aviation Corpus, a\ndataset constructed from unstructured text extracted from the National\nTransportation Safety Board reports. Our research contributes to advancing the\nfield of domain-specific language understanding and showcases the potential of\nknowledge infusion techniques in improving the performance of language models\non question-answering.\n","authors":["Ankush Agarwal","Sakharam Gawade","Amar Prakash Azad","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2308.03638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03629v1","updated":"2023-08-07T14:36:03Z","published":"2023-08-07T14:36:03Z","title":"MedMine: Examining Pre-trained Language Models on Medication Mining","summary":" Automatic medication mining from clinical and biomedical text has become a\npopular topic due to its real impact on healthcare applications and the recent\ndevelopment of powerful language models (LMs). However, fully-automatic\nextraction models still face obstacles to be overcome such that they can be\ndeployed directly into clinical practice for better impacts. Such obstacles\ninclude their imbalanced performances on different entity types and clinical\nevents. In this work, we examine current state-of-the-art pre-trained language\nmodels (PLMs) on such tasks, via fine-tuning including the monolingual model\nMed7 and multilingual large language model (LLM) XLM-RoBERTa. We compare their\nadvantages and drawbacks using historical medication mining shared task data\nsets from n2c2-2018 challenges. We report the findings we get from these\nfine-tuning experiments such that they can facilitate future research on\naddressing them, for instance, how to combine their outputs, merge such models,\nor improve their overall accuracy by ensemble learning and data augmentation.\nMedMine is part of the M3 Initiative \\url{https://github.com/HECTA-UoM/M3}\n","authors":["Haifa Alrdahi","Lifeng Han","Hendrik Šuvalov","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.03629v1.pdf","comment":"Open Research Project. 7 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2308.03601v1","updated":"2023-08-07T14:04:15Z","published":"2023-08-07T14:04:15Z","title":"Negative Lexical Constraints in Neural Machine Translation","summary":" This paper explores negative lexical constraining in English to Czech neural\nmachine translation. Negative lexical constraining is used to prohibit certain\nwords or expressions in the translation produced by the neural translation\nmodel. We compared various methods based on modifying either the decoding\nprocess or the training data. The comparison was performed on two tasks:\nparaphrasing and feedback-based translation refinement. 
We also studied to\nwhich extent these methods \"evade\" the constraints presented to the model\n(usually in the dictionary form) by generating a different surface form of a\ngiven constraint.We propose a way to mitigate the issue through training with\nstemmed negative constraints to counter the model's ability to induce a variety\nof the surface forms of a word that can result in bypassing the constraint. We\ndemonstrate that our method improves the constraining, although the problem\nstill persists in many cases.\n","authors":["Josef Jon","Dušan Variš","Michal Novák","João Paulo Aires","Ondřej Bojar"],"pdf_url":"https://arxiv.org/pdf/2308.03601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03582v1","updated":"2023-08-07T13:38:54Z","published":"2023-08-07T13:38:54Z","title":"WIKITIDE: A Wikipedia-Based Timestamped Definition Pairs Dataset","summary":" A fundamental challenge in the current NLP context, dominated by language\nmodels, comes from the inflexibility of current architectures to 'learn' new\ninformation. While model-centric solutions like continual learning or\nparameter-efficient fine tuning are available, the question still remains of\nhow to reliably identify changes in language or in the world. In this paper, we\npropose WikiTiDe, a dataset derived from pairs of timestamped definitions\nextracted from Wikipedia. We argue that such resource can be helpful for\naccelerating diachronic NLP, specifically, for training models able to scan\nknowledge resources for core updates concerning a concept, an event, or a named\nentity. Our proposed end-to-end method is fully automatic, and leverages a\nbootstrapping algorithm for gradually creating a high-quality dataset. Our\nresults suggest that bootstrapping the seed version of WikiTiDe leads to better\nfine-tuned models. We also leverage fine-tuned models in a number of downstream\ntasks, showing promising results with respect to competitive baselines.\n","authors":["Hsuvas Borkakoty","Luis Espinosa-Anke"],"pdf_url":"https://arxiv.org/pdf/2308.03582v1.pdf","comment":"Accepted by RANLP 2023 main conference"},{"id":"http://arxiv.org/abs/2308.03581v1","updated":"2023-08-07T13:37:05Z","published":"2023-08-07T13:37:05Z","title":"Towards Controllable Natural Language Inference through Lexical\n Inference Types","summary":" Explainable natural language inference aims to provide a mechanism to produce\nexplanatory (abductive) inference chains which ground claims to their\nsupporting premises. A recent corpus called EntailmentBank strives to advance\nthis task by explaining the answer to a question using an entailment tree\n\\cite{dalvi2021explaining}. They employ the T5 model to directly generate the\ntree, which can explain how the answer is inferred. However, it lacks the\nability to explain and control the generation of intermediate steps, which is\ncrucial for the multi-hop inference process. % One recent corpus,\nEntailmentBank, aims to push this task forward by explaining an answer to a\nquestion according to an entailment tree \\cite{dalvi2021explaining}. They\nemploy T5 to generate the tree directly, which can explain how the answer is\ninferred but cannot explain how the intermediate is generated, which is\nessential to the multi-hop inference process. In this work, we focus on\nproposing a controlled natural language inference architecture for\nmulti-premise explanatory inference. 
To improve control and enable explanatory\nanalysis over the generation, we define lexical inference types based on\nAbstract Meaning Representation (AMR) graph and modify the architecture of T5\nto learn a latent sentence representation (T5 bottleneck) conditioned on said\ntype information. We also deliver a dataset of approximately 5000 annotated\nexplanatory inference steps, with well-grounded lexical-symbolic operations.\nExperimental results indicate that the inference typing induced at the T5\nbottleneck can help T5 to generate a conclusion under explicit control.\n","authors":["Yingji Zhang","Danilo S. Carvalho","Ian Pratt-Hartmann","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2308.03581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12375v2","updated":"2023-08-07T13:22:01Z","published":"2023-07-23T16:54:41Z","title":"In-Context Learning in Large Language Models Learns Label Relationships\n but Is Not Conventional Learning","summary":" The performance of Large Language Models (LLMs) on downstream tasks often\nimproves significantly when including examples of the input-label relationship\nin the context. However, there is currently no consensus about how this\nin-context learning (ICL) ability of LLMs works: for example, while Xie et al.\n(2021) liken ICL to a general-purpose learning algorithm, Min et al. (2022b)\nargue ICL does not even learn label relationships from in-context examples. In\nthis paper, we study (1) how labels of in-context examples affect predictions,\n(2) how label relationships learned during pre-training interact with\ninput-label examples provided in-context, and (3) how ICL aggregates label\ninformation across in-context examples. Our findings suggests LLMs usually\nincorporate information from in-context labels, but that pre-training and\nin-context label relationships are treated differently, and that the model does\nnot consider all in-context information equally. Our results give insights into\nunderstanding and aligning LLM behavior.\n","authors":["Jannik Kossen","Tom Rainforth","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2307.12375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03565v1","updated":"2023-08-07T13:16:42Z","published":"2023-08-07T13:16:42Z","title":"Topological Interpretations of GPT-3","summary":" This is an experiential study of investigating a consistent method for\nderiving the correlation between sentence vector and semantic meaning of a\nsentence. We first used three state-of-the-art word/sentence embedding methods\nincluding GPT-3, Word2Vec, and Sentence-BERT, to embed plain text sentence\nstrings into high dimensional spaces. Then we compute the pairwise distance\nbetween any possible combination of two sentence vectors in an embedding space\nand map them into a matrix. Based on each distance matrix, we compute the\ncorrelation of distances of a sentence vector with respect to the other\nsentence vectors in an embedding space. Then we compute the correlation of each\npair of the distance matrices. We observed correlations of the same sentence in\ndifferent embedding spaces and correlations of different sentences in the same\nembedding space. 
These observations are consistent with our hypothesis and take\nus to the next stage.\n","authors":["Tianyi Sun","Bradley Nelson"],"pdf_url":"https://arxiv.org/pdf/2308.03565v1.pdf","comment":"70 pages"},{"id":"http://arxiv.org/abs/2308.03558v1","updated":"2023-08-07T13:10:35Z","published":"2023-08-07T13:10:35Z","title":"Mondrian: Prompt Abstraction Attack Against Large Language Models for\n Cheaper API Pricing","summary":" The Machine Learning as a Service (MLaaS) market is rapidly expanding and\nbecoming more mature. For example, OpenAI's ChatGPT is an advanced large\nlanguage model (LLM) that generates responses for various queries with\nassociated fees. Although these models can deliver satisfactory performance,\nthey are far from perfect. Researchers have long studied the vulnerabilities\nand limitations of LLMs, such as adversarial attacks and model toxicity.\nInevitably, commercial ML models are also not exempt from such issues, which\ncan be problematic as MLaaS continues to grow. In this paper, we discover a new\nattack strategy against LLM APIs, namely the prompt abstraction attack.\nSpecifically, we propose Mondrian, a simple and straightforward method that\nabstracts sentences, which can lower the cost of using LLM APIs. In this\napproach, the adversary first creates a pseudo API (with a lower established\nprice) to serve as the proxy of the target API (with a higher established\nprice). Next, the pseudo API leverages Mondrian to modify the user query,\nobtain the abstracted response from the target API, and forward it back to the\nend user. Our results show that Mondrian successfully reduces user queries'\ntoken length ranging from 13% to 23% across various tasks, including text\nclassification, generation, and question answering. Meanwhile, these abstracted\nqueries do not significantly affect the utility of task-specific and general\nlanguage models like ChatGPT. Mondrian also reduces instruction prompts' token\nlength by at least 11% without compromising output quality. As a result, the\nprompt abstraction attack enables the adversary to profit without bearing the\ncost of API development and deployment.\n","authors":["Wai Man Si","Michael Backes","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03549v1","updated":"2023-08-07T12:56:13Z","published":"2023-08-07T12:56:13Z","title":"Zhongjing: Enhancing the Chinese Medical Capabilities of Large Language\n Model through Expert Feedback and Real-world Multi-turn Dialogue","summary":" Recent advances in Large Language Models (LLMs) have achieved remarkable\nbreakthroughs in understanding and responding to user intents. However, their\nperformance lag behind general use cases in some expertise domains, such as\nChinese medicine. Existing efforts to incorporate Chinese medicine into LLMs\nrely on Supervised Fine-Tuning (SFT) with single-turn and distilled dialogue\ndata. These models lack the ability for doctor-like proactive inquiry and\nmulti-turn comprehension and cannot always align responses with safety and\nprofessionalism experts. In this work, we introduce Zhongjing, the first\nChinese medical LLaMA-based LLM that implements an entire training pipeline\nfrom pre-training to reinforcement learning with human feedback (RLHF).\nAdditionally, we introduce a Chinese multi-turn medical dialogue dataset of\n70,000 authentic doctor-patient dialogues, CMtMedQA, which significantly\nenhances the model's capability for complex dialogue and proactive inquiry\ninitiation. 
We define a refined annotation rule and evaluation criteria given\nthe biomedical domain's unique characteristics. Results show that our model\noutperforms baselines in various capacities and matches the performance of\nChatGPT in a few abilities, despite having 50x training data with previous best\nmodel and 100x parameters with ChatGPT. RLHF further improves the model's\ninstruction-following ability and safety. We also release our code, datasets\nand model for further research.\n","authors":["Songhua Yang","Hanjia Zhao","Senbin Zhu","Guangyu Zhou","Hongfei Xu","Yuxiang Jia","Hongying Zan"],"pdf_url":"https://arxiv.org/pdf/2308.03549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03531v1","updated":"2023-08-07T12:30:00Z","published":"2023-08-07T12:30:00Z","title":"Measuring Variety, Balance, and Disparity: An Analysis of Media Coverage\n of the 2021 German Federal Election","summary":" Determining and measuring diversity in news articles is important for a\nnumber of reasons, including preventing filter bubbles and fueling public\ndiscourse, especially before elections. So far, the identification and analysis\nof diversity have been illuminated in a variety of ways, such as measuring the\noverlap of words or topics between news articles related to US elections.\nHowever, the question of how diversity in news articles can be measured\nholistically, i.e., with respect to (1) variety, (2) balance, and (3)\ndisparity, considering individuals, parties, and topics, has not been\naddressed. In this paper, we present a framework for determining diversity in\nnews articles according to these dimensions. Furthermore, we create and provide\na dataset of Google Top Stories, encompassing more than 26,000 unique headlines\nfrom more than 900 news outlets collected within two weeks before and after the\n2021 German federal election. While we observe high diversity for more general\nsearch terms (e.g., \"election\"), a range of search terms (\"education,\"\n\"Europe,\" \"climate protection,\" \"government\") resulted in news articles with\nhigh diversity in two out of three dimensions. This reflects a more subjective,\ndedicated discussion on rather future-oriented topics.\n","authors":["Michael Färber","Jannik Schwade","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2308.03531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03519v1","updated":"2023-08-07T12:13:25Z","published":"2023-08-07T12:13:25Z","title":"Vocab-Expander: A System for Creating Domain-Specific Vocabularies Based\n on Word Embeddings","summary":" In this paper, we propose Vocab-Expander at https://vocab-expander.com, an\nonline tool that enables end-users (e.g., technology scouts) to create and\nexpand a vocabulary of their domain of interest. It utilizes an ensemble of\nstate-of-the-art word embedding techniques based on web text and ConceptNet, a\ncommon-sense knowledge base, to suggest related terms for already given terms.\nThe system has an easy-to-use interface that allows users to quickly confirm or\nreject term suggestions. 
Vocab-Expander offers a variety of potential use\ncases, such as improving concept-based information retrieval in technology and\ninnovation management, enhancing communication and collaboration within\norganizations or interdisciplinary projects, and creating vocabularies for\nspecific courses in education.\n","authors":["Michael Färber","Nicholas Popovic"],"pdf_url":"https://arxiv.org/pdf/2308.03519v1.pdf","comment":"accepted at RANLP'23"},{"id":"http://arxiv.org/abs/2307.00925v4","updated":"2023-08-07T11:40:59Z","published":"2023-07-03T10:53:05Z","title":"Automatic Design of Semantic Similarity Ensembles Using Grammatical\n Evolution","summary":" Semantic similarity measures are widely used in natural language processing\nto catalyze various computer-related tasks. However, no single semantic\nsimilarity measure is the most appropriate for all tasks, and researchers often\nuse ensemble strategies to ensure performance. This research work proposes a\nmethod for automatically designing semantic similarity ensembles. In fact, our\nproposed method uses grammatical evolution, for the first time, to\nautomatically select and aggregate measures from a pool of candidates to create\nan ensemble that maximizes correlation to human judgment. The method is\nevaluated on several benchmark datasets and compared to state-of-the-art\nensembles, showing that it can significantly improve similarity assessment\naccuracy and outperform existing methods in some cases. As a result, our\nresearch demonstrates the potential of using grammatical evolution to\nautomatically compare text and prove the benefits of using ensembles for\nsemantic similarity tasks. The source code that illustrates our approach can be\ndownloaded from https://github.com/jorge-martinez-gil/sesige.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.00925v4.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2211.08264v2","updated":"2023-08-07T11:22:16Z","published":"2022-11-15T16:14:39Z","title":"QAmeleon: Multilingual QA with Only 5 Examples","summary":" The availability of large, high-quality datasets has been one of the main\ndrivers of recent progress in question answering (QA). Such annotated datasets\nhowever are difficult and costly to collect, and rarely exist in languages\nother than English, rendering QA technology inaccessible to underrepresented\nlanguages. An alternative to building large monolingual training datasets is to\nleverage pre-trained language models (PLMs) under a few-shot learning setting.\nOur approach, QAmeleon, uses a PLM to automatically generate multilingual data\nupon which QA models are trained, thus avoiding costly annotation. Prompt\ntuning the PLM for data synthesis with only five examples per language delivers\naccuracy superior to translation-based baselines, bridges nearly 60% of the gap\nbetween an English-only baseline and a fully supervised upper bound trained on\nalmost 50,000 hand labeled examples, and always leads to substantial\nimprovements compared to fine-tuning a QA model directly on labeled examples in\nlow resource settings. 
Experiments on the TyDiQA-GoldP and MLQA benchmarks show\nthat few-shot prompt tuning for data synthesis scales across languages and is a\nviable alternative to large-scale annotation.\n","authors":["Priyanka Agrawal","Chris Alberti","Fantine Huot","Joshua Maynez","Ji Ma","Sebastian Ruder","Kuzman Ganchev","Dipanjan Das","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2211.08264v2.pdf","comment":"To Appear at Transactions of Association for Computational\n Linguistics (TACL)"},{"id":"http://arxiv.org/abs/2301.05880v2","updated":"2023-08-07T10:36:44Z","published":"2023-01-14T10:18:22Z","title":"TikTalk: A Video-Based Dialogue Dataset for Multi-Modal Chitchat in Real\n World","summary":" To facilitate the research on intelligent and human-like chatbots with\nmulti-modal context, we introduce a new video-based multi-modal dialogue\ndataset, called TikTalk. We collect 38K videos from a popular video-sharing\nplatform, along with 367K conversations posted by users beneath them. Users\nengage in spontaneous conversations based on their multi-modal experiences from\nwatching videos, which helps recreate real-world chitchat context. Compared to\nprevious multi-modal dialogue datasets, the richer context types in TikTalk\nlead to more diverse conversations, but also increase the difficulty in\ncapturing human interests from intricate multi-modal information to generate\npersonalized responses. Moreover, external knowledge is more frequently evoked\nin our dataset. These facts reveal new challenges for multi-modal dialogue\nmodels. We quantitatively demonstrate the characteristics of TikTalk, propose a\nvideo-based multi-modal chitchat task, and evaluate several dialogue baselines.\nExperimental results indicate that the models incorporating large language\nmodels (LLM) can generate more diverse responses, while the model utilizing\nknowledge graphs to introduce external knowledge performs the best overall.\nFurthermore, no existing model can solve all the above challenges well. There\nis still a large room for future improvements, even for LLM with visual\nextensions. Our dataset is available at\n\\url{https://ruc-aimind.github.io/projects/TikTalk/}.\n","authors":["Hongpeng Lin","Ludan Ruan","Wenke Xia","Peiyu Liu","Jingyuan Wen","Yixin Xu","Di Hu","Ruihua Song","Wayne Xin Zhao","Qin Jin","Zhiwu Lu"],"pdf_url":"https://arxiv.org/pdf/2301.05880v2.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.03449v1","updated":"2023-08-07T10:11:42Z","published":"2023-08-07T10:11:42Z","title":"Knowledge-preserving Pruning for Pre-trained Language Models without\n Retraining","summary":" Given a pre-trained language model, how can we efficiently compress it\nwithout retraining? Retraining-free structured pruning algorithms are crucial\nin pre-trained language model compression due to their significantly reduced\npruning cost and capability to prune large language models. However, existing\nretraining-free algorithms encounter severe accuracy degradation, as they fail\nto preserve the useful knowledge of pre-trained models. In this paper, we\npropose K-pruning (Knowledge-preserving pruning), an accurate retraining-free\nstructured pruning algorithm for pre-trained language models. K-pruning\nidentifies and prunes attention heads and neurons deemed to be superfluous,\nbased on the amount of their inherent knowledge. K-pruning applies an iterative\nprocess of pruning followed by knowledge reconstruction for each sub-layer to\npreserve the knowledge of the pre-trained models. 
Consequently, K-pruning shows\nup to 58.02%p higher F1 score than existing retraining-free pruning algorithms\nunder a high compression rate of 80% on the SQuAD benchmark.\n","authors":["Seungcheol Park","Hojun Choi","U Kang"],"pdf_url":"https://arxiv.org/pdf/2308.03449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01633v2","updated":"2023-08-07T09:54:55Z","published":"2023-05-02T17:46:12Z","title":"Missing Information, Unresponsive Authors, Experimental Flaws: The\n Impossibility of Assessing the Reproducibility of Previous Human Evaluations\n in NLP","summary":" We report our efforts in identifying a set of previous human evaluations in\nNLP that would be suitable for a coordinated study examining what makes human\nevaluations in NLP more/less reproducible. We present our results and findings,\nwhich include that just 13\\% of papers had (i) sufficiently low barriers to\nreproduction, and (ii) enough obtainable information, to be considered for\nreproduction, and that all but one of the experiments we selected for\nreproduction was discovered to have flaws that made the meaningfulness of\nconducting a reproduction questionable. As a result, we had to change our\ncoordinated study design from a reproduce approach to a\nstandardise-then-reproduce-twice approach. Our overall (negative) finding that\nthe great majority of human evaluations in NLP is not repeatable and/or not\nreproducible and/or too flawed to justify reproduction, paints a dire picture,\nbut presents an opportunity for a rethink about how to design and report human\nevaluations in NLP.\n","authors":["Anya Belz","Craig Thomson","Ehud Reiter","Gavin Abercrombie","Jose M. Alonso-Moral","Mohammad Arvan","Anouck Braggaar","Mark Cieliebak","Elizabeth Clark","Kees van Deemter","Tanvi Dinkar","Ondřej Dušek","Steffen Eger","Qixiang Fang","Mingqi Gao","Albert Gatt","Dimitra Gkatzia","Javier González-Corbelle","Dirk Hovy","Manuela Hürlimann","Takumi Ito","John D. Kelleher","Filip Klubicka","Emiel Krahmer","Huiyuan Lai","Chris van der Lee","Yiru Li","Saad Mahamood","Margot Mieskes","Emiel van Miltenburg","Pablo Mosteiro","Malvina Nissim","Natalie Parde","Ondřej Plátek","Verena Rieser","Jie Ruan","Joel Tetreault","Antonio Toral","Xiaojun Wan","Leo Wanner","Lewis Watson","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.01633v2.pdf","comment":"5 pages plus appendix, 4 tables, 1 figure. To appear at \"Workshop on\n Insights from Negative Results in NLP\" (co-located with EACL2023). Updated\n author list and acknowledgements"},{"id":"http://arxiv.org/abs/2308.03429v1","updated":"2023-08-07T09:24:24Z","published":"2023-08-07T09:24:24Z","title":"RCMHA: Relative Convolutional Multi-Head Attention for Natural Language\n Modelling","summary":" The Attention module finds common usage in language modeling, presenting\ndistinct challenges within the broader scope of Natural Language Processing.\nMulti-Head Attention (MHA) employs an absolute positional encoding, which\nimposes limitations on token length and entails substantial memory consumption\nduring the processing of embedded inputs. The current remedy proposed by\nresearchers involves the utilization of relative positional encoding, similar\nto the approach adopted in Transformer-XL or Relative Multi-Head Attention\n(RMHA), albeit the employed architecture consumes considerable memory\nresources. 
To address these challenges, this study endeavors to refine MHA,\nleveraging relative positional encoding in conjunction with the Depth-Wise\nConvolutional Layer architecture, which promises heightened accuracy coupled\nwith minimized memory usage. The proposed RCMHA framework entails the\nmodification of two integral components: firstly, the application of the\nDepth-Wise Convolutional Layer to the input embedding, encompassing Query, Key,\nand Value parameters; secondly, the incorporation of Relative Positional\nEncoding into the attention scoring phase, harmoniously integrated with Scaled\nDot-Product Attention. Empirical experiments underscore the advantages of\nRCMHA, wherein it exhibits superior accuracy, boasting a score of 0.572 in\ncomparison to alternative attention modules such as MHA, Multi-DConv-Head\nAttention (MDHA), and RMHA. Concerning memory utilization, RMHA emerges as the\nmost frugal, demonstrating an average consumption of 2.98 GB, surpassing RMHA\nwhich necessitates 3.5 GB.\n","authors":["Herman Sugiharto"," Aradea","Husni Mubarok"],"pdf_url":"https://arxiv.org/pdf/2308.03429v1.pdf","comment":"13 pages, 13 figures, 6 tables"},{"id":"http://arxiv.org/abs/2308.03423v1","updated":"2023-08-07T09:19:59Z","published":"2023-08-07T09:19:59Z","title":"Boosting Chinese ASR Error Correction with Dynamic Error Scaling\n Mechanism","summary":" Chinese Automatic Speech Recognition (ASR) error correction presents\nsignificant challenges due to the Chinese language's unique features, including\na large character set and borderless, morpheme-based structure. Current\nmainstream models often struggle with effectively utilizing word-level features\nand phonetic information. This paper introduces a novel approach that\nincorporates a dynamic error scaling mechanism to detect and correct\nphonetically erroneous text generated by ASR output. This mechanism operates by\ndynamically fusing word-level features and phonetic information, thereby\nenriching the model with additional semantic data. Furthermore, our method\nimplements unique error reduction and amplification strategies to address the\nissues of matching wrong words caused by incorrect characters. Experimental\nresults indicate substantial improvements in ASR error correction,\ndemonstrating the effectiveness of our proposed method and yielding promising\nresults on established datasets.\n","authors":["Jiaxin Fan","Yong Zhang","Hanzhang Li","Jianzong Wang","Zhitao Li","Sheng Ouyang","Ning Cheng","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03423v1.pdf","comment":"Accepted by 24th Annual Conference of the International Speech\n Communication Association (INTERSPEECH 2023)"},{"id":"http://arxiv.org/abs/2306.11518v2","updated":"2023-08-07T09:17:43Z","published":"2023-06-20T13:12:58Z","title":"One model to rule them all: ranking Slovene summarizers","summary":" Text summarization is an essential task in natural language processing, and\nresearchers have developed various approaches over the years, ranging from\nrule-based systems to neural networks. However, there is no single model or\napproach that performs well on every type of text. We propose a system that\nrecommends the most suitable summarization model for a given text. The proposed\nsystem employs a fully connected neural network that analyzes the input content\nand predicts which summarizer should score the best in terms of ROUGE score for\na given input. 
The meta-model selects among four different summarization\nmodels, developed for the Slovene language, using different properties of the\ninput, in particular its Doc2Vec document representation. The four Slovene\nsummarization models deal with different challenges associated with text\nsummarization in a less-resourced language. We evaluate the proposed SloMetaSum\nmodel performance automatically and parts of it manually. The results show that\nthe system successfully automates the step of manually selecting the best\nmodel.\n","authors":["Aleš Žagar","Marko Robnik-Šikonja"],"pdf_url":"https://arxiv.org/pdf/2306.11518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03422v1","updated":"2023-08-07T09:15:03Z","published":"2023-08-07T09:15:03Z","title":"Prompt Guided Copy Mechanism for Conversational Question Answering","summary":" Conversational Question Answering (CQA) is a challenging task that aims to\ngenerate natural answers for conversational flow questions. In this paper, we\npropose a pluggable approach for extractive methods that introduces a novel\nprompt-guided copy mechanism to improve the fluency and appropriateness of the\nextracted answers. Our approach uses prompts to link questions to answers and\nemploys attention to guide the copy mechanism to verify the naturalness of\nextracted answers, making necessary edits to ensure that the answers are fluent\nand appropriate. The three prompts, including a question-rationale relationship\nprompt, a question description prompt, and a conversation history prompt,\nenhance the copy mechanism's performance. Our experiments demonstrate that this\napproach effectively promotes the generation of natural answers and achieves\ngood results in the CoQA challenge.\n","authors":["Yong Zhang","Zhitao Li","Jianzong Wang","Yiming Gao","Ning Cheng","Fengying Yu","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03422v1.pdf","comment":"Accepted by 24th Annual Conference of the International Speech\n Communication Association (INTERSPEECH 2023)"},{"id":"http://arxiv.org/abs/2308.03421v1","updated":"2023-08-07T09:14:33Z","published":"2023-08-07T09:14:33Z","title":"RecycleGPT: An Autoregressive Language Model with Recyclable Module","summary":" Existing large language models have to run K times to generate a sequence of\nK tokens. In this paper, we present RecycleGPT, a generative language model\nwith fast decoding speed by recycling pre-generated model states without\nrunning the whole model in multiple steps. Our approach relies on the\nobservation that adjacent tokens in a sequence usually have strong correlations\nand the next token in a sequence can be reasonably guessed or inferred based on\nthe preceding ones. Through theoretical evaluations and practical tests on\ndownstream text generation tasks, we demonstrate the effectiveness of our\napproach in lowering inference latency, achieving up to 1.4x speedup while\npreserving high performance.\n","authors":["Yufan Jiang","Qiaozhi He","Xiaomin Zhuang","Zhihua Wu","Kunpeng Wang","Wenlai Zhao","Guangwen Yang"],"pdf_url":"https://arxiv.org/pdf/2308.03421v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.10511v2","updated":"2023-08-07T09:08:23Z","published":"2023-07-20T00:36:41Z","title":"General Debiasing for Multimodal Sentiment Analysis","summary":" Existing work on Multimodal Sentiment Analysis (MSA) utilizes multimodal\ninformation for prediction yet unavoidably suffers from fitting the spurious\ncorrelations between multimodal features and sentiment labels. 
For example, if\nmost videos with a blue background have positive labels in a dataset, the model\nwill rely on such correlations for prediction, while \"blue background\" is not a\nsentiment-related feature. To address this problem, we define a general\ndebiasing MSA task, which aims to enhance the Out-Of-Distribution (OOD)\ngeneralization ability of MSA models by reducing their reliance on spurious\ncorrelations. To this end, we propose a general debiasing framework based on\nInverse Probability Weighting (IPW), which adaptively assigns small weights to\nthe samples with larger bias (i.e., the severer spurious correlations). The key\nto this debiasing framework is to estimate the bias of each sample, which is\nachieved by two steps: 1) disentangling the robust features and biased features\nin each modality, and 2) utilizing the biased features to estimate the bias.\nFinally, we employ IPW to reduce the effects of large-biased samples,\nfacilitating robust feature learning for sentiment prediction. To examine the\nmodel's generalization ability, we keep the original testing sets on two\nbenchmarks and additionally construct multiple unimodal and multimodal OOD\ntesting sets. The empirical results demonstrate the superior generalization\nability of our proposed framework. We have released the code and data to\nfacilitate the reproduction https://github.com/Teng-Sun/GEAR.\n","authors":["Teng Sun","Juntong Ni","Wenjie Wang","Liqiang Jing","Yinwei Wei","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2307.10511v2.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03415v1","updated":"2023-08-07T09:06:20Z","published":"2023-08-07T09:06:20Z","title":"End-to-End Evaluation for Low-Latency Simultaneous Speech Translation","summary":" The challenge of low-latency speech translation has recently draw significant\ninterest in the research community as shown by several publications and shared\ntasks. Therefore, it is essential to evaluate these different approaches in\nrealistic scenarios. However, currently only specific aspects of the systems\nare evaluated and often it is not possible to compare different approaches.\n In this work, we propose the first framework to perform and evaluate the\nvarious aspects of low-latency speech translation under realistic conditions.\nThe evaluation is carried out in an end-to-end fashion. This includes the\nsegmentation of the audio as well as the run-time of the different components.\n Secondly, we compare different approaches to low-latency speech translation\nusing this framework. We evaluate models with the option to revise the output\nas well as methods with fixed output. Furthermore, we directly compare\nstate-of-the-art cascaded as well as end-to-end systems. 
Finally, the framework\nallows to automatically evaluate the translation quality as well as latency and\nalso provides a web interface to show the low-latency model outputs to the\nuser.\n","authors":["Christian Huber","Tu Anh Dinh","Carlos Mullov","Ngoc Quan Pham","Thai Binh Nguyen","Fabian Retkowski","Stefan Constantin","Enes Yavuz Ugan","Danni Liu","Zhaolin Li","Sai Koneru","Jan Niehues","Alexander Waibel"],"pdf_url":"https://arxiv.org/pdf/2308.03415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08283v3","updated":"2023-08-07T08:32:54Z","published":"2022-12-16T05:10:09Z","title":"SceneGATE: Scene-Graph based co-Attention networks for TExt visual\n question answering","summary":" Most TextVQA approaches focus on the integration of objects, scene texts and\nquestion words by a simple transformer encoder. But this fails to capture the\nsemantic relations between different modalities. The paper proposes a Scene\nGraph based co-Attention Network (SceneGATE) for TextVQA, which reveals the\nsemantic relations among the objects, Optical Character Recognition (OCR)\ntokens and the question words. It is achieved by a TextVQA-based scene graph\nthat discovers the underlying semantics of an image. We created a\nguided-attention module to capture the intra-modal interplay between the\nlanguage and the vision as a guidance for inter-modal interactions. To make\nexplicit teaching of the relations between the two modalities, we proposed and\nintegrated two attention modules, namely a scene graph-based semantic\nrelation-aware attention and a positional relation-aware attention. We\nconducted extensive experiments on two benchmark datasets, Text-VQA and ST-VQA.\nIt is shown that our SceneGATE method outperformed existing ones because of the\nscene graph and its attention modules.\n","authors":["Feiqi Cao","Siwen Luo","Felipe Nunez","Zean Wen","Josiah Poon","Caren Han"],"pdf_url":"https://arxiv.org/pdf/2212.08283v3.pdf","comment":"Published in Robotics (Q1, SCI indexed Journal):\n https://www.mdpi.com/2218-6581/12/4/114"},{"id":"http://arxiv.org/abs/2207.14116v4","updated":"2023-08-07T07:54:45Z","published":"2022-07-28T14:30:06Z","title":"Claim-Dissector: An Interpretable Fact-Checking System with Joint\n Re-ranking and Veracity Prediction","summary":" We present Claim-Dissector: a novel latent variable model for fact-checking\nand analysis, which given a claim and a set of retrieved evidences jointly\nlearns to identify: (i) the relevant evidences to the given claim, (ii) the\nveracity of the claim. We propose to disentangle the per-evidence relevance\nprobability and its contribution to the final veracity probability in an\ninterpretable way -- the final veracity probability is proportional to a linear\nensemble of per-evidence relevance probabilities. In this way, the individual\ncontributions of evidences towards the final predicted probability can be\nidentified. In per-evidence relevance probability, our model can further\ndistinguish whether each relevant evidence is supporting (S) or refuting (R)\nthe claim. This allows to quantify how much the S/R probability contributes to\nthe final verdict or to detect disagreeing evidence.\n Despite its interpretable nature, our system achieves results competitive\nwith state-of-the-art on the FEVER dataset, as compared to typical two-stage\nsystem pipelines, while using significantly fewer parameters. It also sets new\nstate-of-the-art on FAVIQ and RealFC datasets. 
Furthermore, our analysis shows\nthat our model can learn fine-grained relevance cues while using coarse-grained\nsupervision, and we demonstrate it in 2 ways. (i) We show that our model can\nachieve competitive sentence recall while using only paragraph-level relevance\nsupervision. (ii) Traversing towards the finest granularity of relevance, we\nshow that our model is capable of identifying relevance at the token level. To\ndo this, we present a new benchmark TLR-FEVER focusing on token-level\ninterpretability -- humans annotate tokens in relevant evidences they\nconsidered essential when making their judgment. Then we measure how similar\nare these annotations to the tokens our model is focusing on.\n","authors":["Martin Fajcik","Petr Motlicek","Pavel Smrz"],"pdf_url":"https://arxiv.org/pdf/2207.14116v4.pdf","comment":"updated acknowledgement"},{"id":"http://arxiv.org/abs/2304.14104v2","updated":"2023-08-07T07:52:35Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v2.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2308.03365v1","updated":"2023-08-07T07:39:43Z","published":"2023-08-07T07:39:43Z","title":"Improving Few-shot and Zero-shot Entity Linking with Coarse-to-Fine\n Lexicon-based Retriever","summary":" Few-shot and zero-shot entity linking focus on the tail and emerging\nentities, which are more challenging but closer to real-world scenarios. The\nmainstream method is the ''retrieve and rerank'' two-stage framework. In this\npaper, we propose a coarse-to-fine lexicon-based retriever to retrieve entity\ncandidates in an effective manner, which operates in two layers. The first\nlayer retrieves coarse-grained candidates by leveraging entity names, while the\nsecond layer narrows down the search to fine-grained candidates within the\ncoarse-grained ones. In addition, this second layer utilizes entity\ndescriptions to effectively disambiguate tail or new entities that share names\nwith existing popular entities. Experimental results indicate that our approach\ncan obtain superior performance without requiring extensive finetuning in the\nretrieval stage. 
Notably, our approach ranks the 1st in NLPCC 2023 Shared Task\n6 on Chinese Few-shot and Zero-shot Entity Linking.\n","authors":["Shijue Huang","Bingbing Wang","Libo Qin","Qin Zhao","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03365v1.pdf","comment":"Accepted to NLPCC2023"},{"id":"http://arxiv.org/abs/2308.03360v1","updated":"2023-08-07T07:29:49Z","published":"2023-08-07T07:29:49Z","title":"Coupling Symbolic Reasoning with Language Modeling for Efficient\n Longitudinal Understanding of Unstructured Electronic Medical Records","summary":" The application of Artificial Intelligence (AI) in healthcare has been\nrevolutionary, especially with the recent advancements in transformer-based\nLarge Language Models (LLMs). However, the task of understanding unstructured\nelectronic medical records remains a challenge given the nature of the records\n(e.g., disorganization, inconsistency, and redundancy) and the inability of\nLLMs to derive reasoning paradigms that allow for comprehensive understanding\nof medical variables. In this work, we examine the power of coupling symbolic\nreasoning with language modeling toward improved understanding of unstructured\nclinical texts. We show that such a combination improves the extraction of\nseveral medical variables from unstructured records. In addition, we show that\nthe state-of-the-art commercially-free LLMs enjoy retrieval capabilities\ncomparable to those provided by their commercial counterparts. Finally, we\nelaborate on the need for LLM steering through the application of symbolic\nreasoning as the exclusive use of LLMs results in the lowest performance.\n","authors":["Shivani Shekhar","Simran Tiwari","T. C. Rensink","Ramy Eskander","Wael Salloum"],"pdf_url":"https://arxiv.org/pdf/2308.03360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03349v1","updated":"2023-08-07T07:03:49Z","published":"2023-08-07T07:03:49Z","title":"SciGraphQA: A Large-Scale Synthetic Multi-Turn Question-Answering\n Dataset for Scientific Graphs","summary":" In this work, we present SciGraphQA, a synthetic multi-turn question-answer\ndataset related to academic graphs. SciGraphQA is 13 times larger than\nChartVQA, the previously largest chart-visual question-answering dataset. It is\nalso the largest open-sourced chart VQA dataset with non-synthetic charts. To\nbuild our dataset, we selected 290,000 Computer Science or Machine Learning\nArXiv papers published between 2010 and 2020, and then used Palm-2 to generate\n295K samples of open-vocabulary multi-turn question-answering dialogues about\nthe graphs. As context, we provided the text-only Palm-2 with paper title,\nabstract, paragraph mentioning the graph, and rich text contextual data from\nthe graph itself, obtaining dialogues with an average 2.23 question-answer\nturns for each graph. We asked GPT-4 to assess the matching quality of our\nquestion-answer turns given the paper's context, obtaining an average rating of\n8.7/10 on our 3K test set. We evaluated the 0-shot capability of the most\npopular MLLM models such as LLaVa, mPLUGowl, BLIP-2, and openFlamingo's on our\ndataset, finding LLaVA-13B being the most performant with a CIDEr score of\n0.08. We further enriched the question prompts for LLAVA by including the\nserialized data tables extracted from the graphs using the DePlot model,\nboosting LLaVA's 0-shot CIDEr to 0.15. To verify the validity of our dataset,\nwe also fine-tuned LLaVa using our dataset, reaching a substantially higher\nCIDEr score of 0.26. 
We anticipate further accuracy improvement by including\nsegmentation mask tokens and leveraging larger LLM backbones coupled with\nemergent prompting techniques. Our code and data are open-sourced.\n","authors":["Shengzhi Li","Nima Tajbakhsh"],"pdf_url":"https://arxiv.org/pdf/2308.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03960v2","updated":"2023-08-07T06:35:25Z","published":"2023-05-06T07:06:47Z","title":"Beyond Rule-based Named Entity Recognition and Relation Extraction for\n Process Model Generation from Natural Language Text","summary":" Process-aware information systems offer extensive advantages to companies,\nfacilitating planning, operations, and optimization of day-to-day business\nactivities. However, the time-consuming but required step of designing formal\nbusiness process models often hampers the potential of these systems. To\novercome this challenge, automated generation of business process models from\nnatural language text has emerged as a promising approach to expedite this\nstep. Generally two crucial subtasks have to be solved: extracting\nprocess-relevant information from natural language and creating the actual\nmodel. Approaches towards the first subtask are rule based methods, highly\noptimized for specific domains, but hard to adapt to related applications. To\nsolve this issue, we present an extension to an existing pipeline, to make it\nentirely data driven. We demonstrate the competitiveness of our improved\npipeline, which not only eliminates the substantial overhead associated with\nfeature engineering and rule definition, but also enables adaptation to\ndifferent datasets, entity and relation types, and new domains. Additionally,\nthe largest available dataset (PET) for the first subtask, contains no\ninformation about linguistic references between mentions of entities in the\nprocess description. Yet, the resolution of these mentions into a single visual\nelement is essential for high quality process models. We propose an extension\nto the PET dataset that incorporates information about linguistic references\nand a corresponding method for resolving them. Finally, we provide a detailed\nanalysis of the inherent challenges in the dataset at hand.\n","authors":["Julian Neuberger","Lars Ackermann","Stefan Jablonski"],"pdf_url":"https://arxiv.org/pdf/2305.03960v2.pdf","comment":"Currently under review for CoopIS23"},{"id":"http://arxiv.org/abs/2305.18462v2","updated":"2023-08-07T06:32:56Z","published":"2023-05-29T07:06:03Z","title":"Membership Inference Attacks against Language Models via Neighbourhood\n Comparison","summary":" Membership Inference attacks (MIAs) aim to predict whether a data sample was\npresent in the training data of a machine learning model or not, and are widely\nused for assessing the privacy risks of language models. Most existing attacks\nrely on the observation that models tend to assign higher probabilities to\ntheir training samples than non-training points. However, simple thresholding\nof the model score in isolation tends to lead to high false-positive rates as\nit does not account for the intrinsic complexity of a sample. Recent work has\ndemonstrated that reference-based attacks which compare model scores to those\nobtained from a reference model trained on similar data can substantially\nimprove the performance of MIAs. 
However, in order to train reference models,\nattacks of this kind make the strong and arguably unrealistic assumption that\nan adversary has access to samples closely resembling the original training\ndata. Therefore, we investigate their performance in more realistic scenarios\nand find that they are highly fragile in relation to the data distribution used\nto train reference models. To investigate whether this fragility provides a\nlayer of safety, we propose and evaluate neighbourhood attacks, which compare\nmodel scores for a given sample to scores of synthetically generated neighbour\ntexts and therefore eliminate the need for access to the training data\ndistribution. We show that, in addition to being competitive with\nreference-based attacks that have perfect knowledge about the training data\ndistribution, our attack clearly outperforms existing reference-free attacks as\nwell as reference-based attacks with imperfect knowledge, which demonstrates\nthe need for a reevaluation of the threat model of adversarial attacks.\n","authors":["Justus Mattern","Fatemehsadat Mireshghallah","Zhijing Jin","Bernhard Schölkopf","Mrinmaya Sachan","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2305.18462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02047v2","updated":"2023-08-07T06:21:31Z","published":"2023-07-05T06:05:36Z","title":"CAME: Confidence-guided Adaptive Memory Efficient Optimization","summary":" Adaptive gradient methods, such as Adam and LAMB, have demonstrated excellent\nperformance in the training of large language models. Nevertheless, the need\nfor adaptivity requires maintaining second-moment estimates of the\nper-parameter gradients, which entails a high cost of extra memory overheads.\nTo solve this problem, several memory-efficient optimizers (e.g., Adafactor)\nhave been proposed to obtain a drastic reduction in auxiliary memory usage, but\nwith a performance penalty. In this paper, we first study a confidence-guided\nstrategy to reduce the instability of existing memory efficient optimizers.\nBased on this strategy, we propose CAME to simultaneously achieve two goals:\nfast convergence as in traditional adaptive methods, and low memory usage as in\nmemory-efficient methods. Extensive experiments demonstrate the training\nstability and superior performance of CAME across various NLP tasks such as\nBERT and GPT-2 training. Notably, for BERT pre-training on the large batch size\nof 32,768, our proposed optimizer attains faster convergence and higher\naccuracy compared with the Adam optimizer. The implementation of CAME is\npublicly available.\n","authors":["Yang Luo","Xiaozhe Ren","Zangwei Zheng","Zhuo Jiang","Xin Jiang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2307.02047v2.pdf","comment":"Accepted by ACL 2023"},{"id":"http://arxiv.org/abs/2308.03311v1","updated":"2023-08-07T05:40:01Z","published":"2023-08-07T05:40:01Z","title":"CrossTalk: Enhancing Communication and Collaboration in\n Videoconferencing with Intent Recognition from Conversational Speech","summary":" Despite the advances and ubiquity of digital communication media such as\nvideoconferencing and virtual reality, they remain oblivious to the rich\nintentions expressed by users. 
Beyond transmitting audio, videos, and messages,\nwe envision digital communication media as proactive facilitators that can\nprovide unobtrusive assistance to enhance communication and collaboration.\nInformed by the results of a formative study, we propose three key design\nconcepts to explore the systematic integration of intelligence into\ncommunication and collaboration, including the panel substrate, language-based\nintent recognition, and lightweight interaction techniques. We developed\nCrossTalk, a videoconferencing system that instantiates these concepts, which\nwas found to enable a more fluid and flexible communication and collaboration\nexperience.\n","authors":["Haijun Xia","Tony Wang","Aditya Gunturu","Peiling Jiang","William Duan","Xiaoshuo Yao"],"pdf_url":"https://arxiv.org/pdf/2308.03311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03303v1","updated":"2023-08-07T05:12:27Z","published":"2023-08-07T05:12:27Z","title":"LoRA-FA: Memory-efficient Low-rank Adaptation for Large Language Models\n Fine-tuning","summary":" The low-rank adaptation (LoRA) method can largely reduce the amount of\ntrainable parameters for fine-tuning large language models (LLMs), however, it\nstill requires expensive activation memory to update low-rank weights. Reducing\nthe number of LoRA layers or using activation recomputation could harm the\nfine-tuning performance or increase the computational overhead. In this work,\nwe present LoRA-FA, a memory-efficient fine-tuning method that reduces the\nactivation memory without performance degradation and expensive recomputation.\nLoRA-FA chooses to freeze the projection-down weight of $A$ and update the\nprojection-up weight of $B$ in each LoRA layer. It ensures the change of model\nweight reside in a low-rank space during LLMs fine-tuning, while eliminating\nthe requirement to store full-rank input activations. We conduct extensive\nexperiments across multiple model types (RoBERTa, T5, LLaMA) and model scales.\nOur results show that LoRA-FA can always achieve close fine-tuning accuracy\nacross different tasks compared to full parameter fine-tuning and LoRA.\nFurthermore, LoRA-FA can reduce the overall memory cost by up to 1.4$\\times$\ncompared to LoRA.\n","authors":["Longteng Zhang","Lin Zhang","Shaohuai Shi","Xiaowen Chu","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2308.03303v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.03296v1","updated":"2023-08-07T04:47:42Z","published":"2023-08-07T04:47:42Z","title":"Studying Large Language Model Generalization with Influence Functions","summary":" When trying to gain better visibility into a machine learning model in order\nto understand and mitigate the associated risks, a potentially valuable source\nof evidence is: which training examples most contribute to a given behavior?\nInfluence functions aim to answer a counterfactual: how would the model's\nparameters (and hence its outputs) change if a given sequence were added to the\ntraining set? While influence functions have produced insights for small\nmodels, they are difficult to scale to large language models (LLMs) due to the\ndifficulty of computing an inverse-Hessian-vector product (IHVP). We use the\nEigenvalue-corrected Kronecker-Factored Approximate Curvature (EK-FAC)\napproximation to scale influence functions up to LLMs with up to 52 billion\nparameters. In our experiments, EK-FAC achieves similar accuracy to traditional\ninfluence function estimators despite the IHVP computation being orders of\nmagnitude faster. 
We investigate two algorithmic techniques to reduce the cost\nof computing gradients of candidate training sequences: TF-IDF filtering and\nquery batching. We use influence functions to investigate the generalization\npatterns of LLMs, including the sparsity of the influence patterns, increasing\nabstraction with scale, math and programming abilities, cross-lingual\ngeneralization, and role-playing behavior. Despite many apparently\nsophisticated forms of generalization, we identify a surprising limitation:\ninfluences decay to near-zero when the order of key phrases is flipped.\nOverall, influence functions give us a powerful new tool for studying the\ngeneralization properties of LLMs.\n","authors":["Roger Grosse","Juhan Bae","Cem Anil","Nelson Elhage","Alex Tamkin","Amirhossein Tajdini","Benoit Steiner","Dustin Li","Esin Durmus","Ethan Perez","Evan Hubinger","Kamilė Lukošiūtė","Karina Nguyen","Nicholas Joseph","Sam McCandlish","Jared Kaplan","Samuel R. Bowman"],"pdf_url":"https://arxiv.org/pdf/2308.03296v1.pdf","comment":"119 pages, 47 figures, 22 tables"},{"id":"http://arxiv.org/abs/2308.03293v1","updated":"2023-08-07T04:42:36Z","published":"2023-08-07T04:42:36Z","title":"Dialogue Systems Can Generate Appropriate Responses without the Use of\n Question Marks? -- Investigation of the Effects of Question Marks on Dialogue\n Systems","summary":" When individuals engage in spoken discourse, various phenomena can be\nobserved that differ from those that are apparent in text-based conversation.\nWhile written communication commonly uses a question mark to denote a query, in\nspoken discourse, queries are frequently indicated by a rising intonation at\nthe end of a sentence. However, numerous speech recognition engines do not\nappend a question mark to recognized queries, presenting a challenge when\ncreating a spoken dialogue system. Specifically, the absence of a question mark\nat the end of a sentence can impede the generation of appropriate responses to\nqueries in spoken dialogue systems. Hence, we investigate the impact of\nquestion marks on dialogue systems, with the results showing that they have a\nsignificant impact. Moreover, we analyze specific examples in an effort to\ndetermine which types of utterances have the impact on dialogue systems.\n","authors":["Tomoya Mizumoto","Takato Yamazaki","Katsumasa Yoshikawa","Masaya Ohagi","Toshiki Kawamoto","Toshinori Sato"],"pdf_url":"https://arxiv.org/pdf/2308.03293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03291v1","updated":"2023-08-07T04:20:38Z","published":"2023-08-07T04:20:38Z","title":"SynJax: Structured Probability Distributions for JAX","summary":" The development of deep learning software libraries enabled significant\nprogress in the field by allowing users to focus on modeling, while letting the\nlibrary to take care of the tedious and time-consuming task of optimizing\nexecution for modern hardware accelerators. However, this has benefited only\nparticular types of deep learning models, such as Transformers, whose\nprimitives map easily to the vectorized computation. The models that explicitly\naccount for structured objects, such as trees and segmentations, did not\nbenefit equally because they require custom algorithms that are difficult to\nimplement in a vectorized form.\n SynJax directly addresses this problem by providing an efficient vectorized\nimplementation of inference algorithms for structured distributions covering\nalignment, tagging, segmentation, constituency trees and spanning trees. 
With\nSynJax we can build large-scale differentiable models that explicitly model\nstructure in the data. The code is available at\nhttps://github.com/deepmind/synjax.\n","authors":["Miloš Stanojević","Laurent Sartran"],"pdf_url":"https://arxiv.org/pdf/2308.03291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03281v1","updated":"2023-08-07T03:52:59Z","published":"2023-08-07T03:52:59Z","title":"Towards General Text Embeddings with Multi-stage Contrastive Learning","summary":" We present GTE, a general-purpose text embedding model trained with\nmulti-stage contrastive learning. In line with recent advancements in unifying\nvarious NLP tasks into a single format, we train a unified text embedding model\nby employing contrastive learning over a diverse mixture of datasets from\nmultiple sources. By significantly increasing the number of training data\nduring both unsupervised pre-training and supervised fine-tuning stages, we\nachieve substantial performance gains over existing embedding models. Notably,\neven with a relatively modest parameter count of 110M, GTE$_\\text{base}$\noutperforms the black-box embedding API provided by OpenAI and even surpasses\n10x larger text embedding models on the massive text embedding benchmark.\nFurthermore, without additional fine-tuning on each programming language\nindividually, our model outperforms previous best code retrievers of similar\nsize by treating code as text. In summary, our model achieves impressive\nresults by effectively harnessing multi-stage contrastive learning, offering a\npowerful and efficient text embedding model with broad applicability across\nvarious NLP and code-related tasks.\n","authors":["Zehan Li","Xin Zhang","Yanzhao Zhang","Dingkun Long","Pengjun Xie","Meishan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03279v1","updated":"2023-08-07T03:39:52Z","published":"2023-08-07T03:39:52Z","title":"UniversalNER: Targeted Distillation from Large Language Models for Open\n Named Entity Recognition","summary":" Large language models (LLMs) have demonstrated remarkable generalizability,\nsuch as understanding arbitrary entities and relations. Instruction tuning has\nproven effective for distilling LLMs into more cost-efficient models such as\nAlpaca and Vicuna. Yet such student models still trail the original LLMs by\nlarge margins in downstream applications. In this paper, we explore targeted\ndistillation with mission-focused instruction tuning to train student models\nthat can excel in a broad application class such as open information\nextraction. Using named entity recognition (NER) for case study, we show how\nChatGPT can be distilled into much smaller UniversalNER models for open NER.\nFor evaluation, we assemble the largest NER benchmark to date, comprising 43\ndatasets across 9 diverse domains such as biomedicine, programming, social\nmedia, law, finance. Without using any direct supervision, UniversalNER attains\nremarkable NER accuracy across tens of thousands of entity types, outperforming\ngeneral instruction-tuned models such as Alpaca and Vicuna by over 30 absolute\nF1 points in average. With a tiny fraction of parameters, UniversalNER not only\nacquires ChatGPT's capability in recognizing arbitrary entity types, but also\noutperforms its NER accuracy by 7-9 absolute F1 points in average. 
Remarkably,\nUniversalNER even outperforms by a large margin state-of-the-art multi-task\ninstruction-tuned systems such as InstructUIE, which uses supervised NER\nexamples. We also conduct thorough ablation studies to assess the impact of\nvarious components in our distillation approach. We will release the\ndistillation recipe, data, and UniversalNER models to facilitate future\nresearch on targeted distillation.\n","authors":["Wenxuan Zhou","Sheng Zhang","Yu Gu","Muhao Chen","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.03279v1.pdf","comment":"Project page: https://universal-ner.github.io/"},{"id":"http://arxiv.org/abs/2308.03277v1","updated":"2023-08-07T03:37:31Z","published":"2023-08-07T03:37:31Z","title":"From Ambiguity to Explicitness: NLP-Assisted 5G Specification\n Abstraction for Formal Analysis","summary":" Formal method-based analysis of the 5G Wireless Communication Protocol is\ncrucial for identifying logical vulnerabilities and facilitating an\nall-encompassing security assessment, especially in the design phase. Natural\nLanguage Processing (NLP) assisted techniques and most of the tools are not\nwidely adopted by the industry and research community. Traditional formal\nverification through a mathematics approach heavily relied on manual logical\nabstraction prone to being time-consuming, and error-prone. The reason that the\nNLP-assisted method did not apply in industrial research may be due to the\nambiguity in the natural language of the protocol designs nature is\ncontroversial to the explicitness of formal verification. To address the\nchallenge of adopting the formal methods in protocol designs, targeting (3GPP)\nprotocols that are written in natural language, in this study, we propose a\nhybrid approach to streamline the analysis of protocols. We introduce a\ntwo-step pipeline that first uses NLP tools to construct data and then uses\nconstructed data to extract identifiers and formal properties by using the NLP\nmodel. The identifiers and formal properties are further used for formal\nanalysis. We implemented three models that take different dependencies between\nidentifiers and formal properties as criteria. Our results of the optimal model\nreach valid accuracy of 39% for identifier extraction and 42% for formal\nproperties predictions. Our work is proof of concept for an efficient procedure\nin performing formal analysis for largescale complicate specification and\nprotocol analysis, especially for 5G and nextG communications.\n","authors":["Shiyu Yuan","Jingda Yang","Sudhanshu Arya","Carlo Lipizzi","Ying Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03275v1","updated":"2023-08-07T03:34:01Z","published":"2023-08-07T03:34:01Z","title":"Adapter-based Selective Knowledge Distillation for Federated\n Multi-domain Meeting Summarization","summary":" Meeting summarization has emerged as a promising technique for providing\nusers with condensed summaries. However, existing work has focused on training\nmodels on centralized data, neglecting real-world scenarios where meeting data\nare infeasible to collect centrally, due to their sensitive nature. This gap\nmotivates us to explore federated learning for meeting summarization. Two\ncritical challenges impede progress. First, state-of-the-art summarizers are\nbased on parameter-heavy pre-trained models. Exchanging such a model's\nparameters across clients imposes large bandwidth costs. 
Second, as real-world\nmeeting data belong to various domains and are distributed across clients, they\nare instances of non-identically and independently distributed (non-IID). IID\nassumptions do not hold, which changes which forms of learning algorithms best\napply. To address this, we propose Adapter-based Federated Selective Knowledge\nDistillation (AdaFedSelecKD) for training performant client models.\nSpecifically, we develop an adapter-based summarization model where two\nadapters cooperatively facilitate learning using fewer parameters to reduce\ncommunication costs. Then, we devise a selective knowledge distillation\nstrategy, assisting clients in robustly handling domain-focused modelling on\ntheir own data, while leveraging global parameters based on non-IID data.\nExtensive experiments on the QMSum benchmark demonstrate AdaFedSelecKD can\nachieve comparable performance with powerful centralized training methods, and\nshows its generalizability and robustness.\n","authors":["Xiachong Feng","Xiaocheng Feng","Xiyuan Du","Min-Yen Kan","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2308.03275v1.pdf","comment":"This work has been submitted to the IEEE TASLP for possible\n publication. Copyright may be transferred without notice, after which this\n version may no longer be accessible"},{"id":"http://arxiv.org/abs/2103.00676v2","updated":"2023-08-07T03:25:37Z","published":"2021-03-01T01:00:09Z","title":"Token-Modification Adversarial Attacks for Natural Language Processing:\n A Survey","summary":" There are now many adversarial attacks for natural language processing\nsystems. Of these, a vast majority achieve success by modifying individual\ndocument tokens, which we call here a token-modification attack. Each\ntoken-modification attack is defined by a specific combination of fundamental\ncomponents, such as a constraint on the adversary or a particular search\nalgorithm. Motivated by this observation, we survey existing token-modification\nattacks and extract the components of each. We use an attack-independent\nframework to structure our survey which results in an effective categorisation\nof the field and an easy comparison of components. This survey aims to guide\nnew researchers to this field and spark further research into individual attack\ncomponents.\n","authors":["Tom Roth","Yansong Gao","Alsharif Abuadbba","Surya Nepal","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2103.00676v2.pdf","comment":"Version 2: updated"},{"id":"http://arxiv.org/abs/2308.03269v1","updated":"2023-08-07T03:19:59Z","published":"2023-08-07T03:19:59Z","title":"Simple Rule Injection for ComplEx Embeddings","summary":" Recent works in neural knowledge graph inference attempt to combine logic\nrules with knowledge graph embeddings to benefit from prior knowledge. However,\nthey usually cannot avoid rule grounding, and injecting a diverse set of rules\nhas still not been thoroughly explored. In this work, we propose InjEx, a\nmechanism to inject multiple types of rules through simple constraints, which\ncapture definite Horn rules. To start, we theoretically prove that InjEx can\ninject such rules. 
Next, to demonstrate that InjEx infuses interpretable prior\nknowledge into the embedding space, we evaluate InjEx on both the knowledge\ngraph completion (KGC) and few-shot knowledge graph completion (FKGC) settings.\nOur experimental results reveal that InjEx outperforms both baseline KGC models\nas well as specialized few-shot models while maintaining its scalability and\nefficiency.\n","authors":["Haodi Ma","Anthony Colas","Yuejie Wang","Ali Sadeghian","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03266v1","updated":"2023-08-07T03:12:27Z","published":"2023-08-07T03:12:27Z","title":"SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and\n Effective Hotword Customization Ability","summary":" Hotword customization is one of the important issues remained in ASR field -\nit is of value to enable users of ASR systems to customize names of entities,\npersons and other phrases. The past few years have seen both implicit and\nexplicit modeling strategies for ASR contextualization developed. While these\napproaches have performed adequately, they still exhibit certain shortcomings,\nsuch as instability in effectiveness, especially in non-autoregressive ASR\nmodels. In this paper we propose Semantic-augmented Contextual-Paraformer\n(SeACo-Paraformer) a novel NAR based ASR system with flexible and effective\nhotword customization ability. It combines the accuracy of the AED-based model,\nthe efficiency of the NAR model, and the excellent performance in\ncontextualization. In tens of thousands of hours industrial big data\nexperiments, our proposed model outperforms strong baselines in customization\nand general ASR tasks. Besides, we explore an efficient way to filter large\nscale incoming hotwords for further improvement.\n","authors":["Xian Shi","Yexin Yang","Zerui Li","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03266v1.pdf","comment":"early draft"},{"id":"http://arxiv.org/abs/2305.02394v2","updated":"2023-08-07T03:07:59Z","published":"2023-05-03T19:29:26Z","title":"Defending against Insertion-based Textual Backdoor Attacks via\n Attribution","summary":" Textual backdoor attack, as a novel attack model, has been shown to be\neffective in adding a backdoor to the model during training. Defending against\nsuch backdoor attacks has become urgent and important. In this paper, we\npropose AttDef, an efficient attribution-based pipeline to defend against two\ninsertion-based poisoning attacks, BadNL and InSent. Specifically, we regard\nthe tokens with larger attribution scores as potential triggers since larger\nattribution words contribute more to the false prediction results and therefore\nare more likely to be poison triggers. Additionally, we further utilize an\nexternal pre-trained language model to distinguish whether input is poisoned or\nnot. We show that our proposed method can generalize sufficiently well in two\ncommon attack scenarios (poisoning training data and testing data), which\nconsistently improves previous methods. For instance, AttDef can successfully\nmitigate both attacks with an average accuracy of 79.97% (56.59% up) and 48.34%\n(3.99% up) under pre-training and post-training attack defense respectively,\nachieving the new state-of-the-art performance on prediction recovery over four\nbenchmark datasets.\n","authors":["Jiazhao Li","Zhuofeng Wu","Wei Ping","Chaowei Xiao","V. G. Vinod Vydiswaran"],"pdf_url":"https://arxiv.org/pdf/2305.02394v2.pdf","comment":"Findings of ACL 2023. 
Camera-ready version"},{"id":"http://arxiv.org/abs/2212.08632v2","updated":"2023-08-07T03:02:06Z","published":"2022-12-16T18:12:04Z","title":"Enhancing Multi-modal and Multi-hop Question Answering via Structured\n Knowledge and Unified Retrieval-Generation","summary":" Multi-modal multi-hop question answering involves answering a question by\nreasoning over multiple input sources from different modalities. Existing\nmethods often retrieve evidences separately and then use a language model to\ngenerate an answer based on the retrieved evidences, and thus do not adequately\nconnect candidates and are unable to model the interdependent relations during\nretrieval. Moreover, the pipelined approaches of retrieval and generation might\nresult in poor generation performance when retrieval performance is low. To\naddress these issues, we propose a Structured Knowledge and Unified\nRetrieval-Generation (SKURG) approach. SKURG employs an Entity-centered Fusion\nEncoder to align sources from different modalities using shared entities. It\nthen uses a unified Retrieval-Generation Decoder to integrate intermediate\nretrieval results for answer generation and also adaptively determine the\nnumber of retrieval steps. Extensive experiments on two representative\nmulti-modal multi-hop QA datasets MultimodalQA and WebQA demonstrate that SKURG\noutperforms the state-of-the-art models in both source retrieval and answer\ngeneration performance with fewer parameters. Our code is available at\nhttps://github.com/HITsz-TMG/SKURG.\n","authors":["Qian Yang","Qian Chen","Wen Wang","Baotian Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.08632v2.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.02180v2","updated":"2023-08-07T02:53:06Z","published":"2023-08-04T07:51:15Z","title":"Scaling Clinical Trial Matching Using Large Language Models: A Case\n Study in Oncology","summary":" Clinical trial matching is a key process in health delivery and discovery. In\npractice, it is plagued by overwhelming unstructured data and unscalable manual\nprocessing. In this paper, we conduct a systematic study on scaling clinical\ntrial matching using large language models (LLMs), with oncology as the focus\narea. Our study is grounded in a clinical trial matching system currently in\ntest deployment at a large U.S. health network. Initial findings are promising:\nout of box, cutting-edge LLMs, such as GPT-4, can already structure elaborate\neligibility criteria of clinical trials and extract complex matching logic\n(e.g., nested AND/OR/NOT). While still far from perfect, LLMs substantially\noutperform prior strong baselines and may serve as a preliminary solution to\nhelp triage patient-trial candidates with humans in the loop. 
Our study also\nreveals a few significant growth areas for applying LLMs to end-to-end clinical\ntrial matching, such as context limitation and accuracy, especially in\nstructuring patient information from longitudinal medical records.\n","authors":["Cliff Wong","Sheng Zhang","Yu Gu","Christine Moung","Jacob Abel","Naoto Usuyama","Roshanthi Weerasinghe","Brian Piening","Tristan Naumann","Carlo Bifulco","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.02180v2.pdf","comment":"24 pages, 5 figures, accepted at Machine Learning for Healthcare\n (MLHC) 2023"},{"id":"http://arxiv.org/abs/2308.03253v1","updated":"2023-08-07T02:18:23Z","published":"2023-08-07T02:18:23Z","title":"PaniniQA: Enhancing Patient Education Through Interactive Question\n Answering","summary":" Patient portal allows discharged patients to access their personalized\ndischarge instructions in electronic health records (EHRs). However, many\npatients have difficulty understanding or memorizing their discharge\ninstructions. In this paper, we present PaniniQA, a patient-centric interactive\nquestion answering system designed to help patients understand their discharge\ninstructions. PaniniQA first identifies important clinical content from\npatients' discharge instructions and then formulates patient-specific\neducational questions. In addition, PaniniQA is also equipped with answer\nverification functionality to provide timely feedback to correct patients'\nmisunderstandings. Our comprehensive automatic and human evaluation results\ndemonstrate our PaniniQA is capable of improving patients' mastery of their\nmedical instructions through effective interactions\n","authors":["Pengshan Cai","Zonghai Yao","Fei Liu","Dakuo Wang","Meghan Reilly","Huixue Zhou","Lingxi Li","Yi Cao","Alok Kapoor","Adarsha Bajracharya","Dan Berlowitz","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03253v1.pdf","comment":"Accepted to TACL 2023. This arXiv version is a pre-MIT Press\n publication version"},{"id":"http://arxiv.org/abs/2308.03235v1","updated":"2023-08-07T01:10:50Z","published":"2023-08-07T01:10:50Z","title":"Analysis of the Evolution of Advanced Transformer-Based Language Models:\n Experiments on Opinion Mining","summary":" Opinion mining, also known as sentiment analysis, is a subfield of natural\nlanguage processing (NLP) that focuses on identifying and extracting subjective\ninformation in textual material. This can include determining the overall\nsentiment of a piece of text (e.g., positive or negative), as well as\nidentifying specific emotions or opinions expressed in the text, that involves\nthe use of advanced machine and deep learning techniques. Recently,\ntransformer-based language models make this task of human emotion analysis\nintuitive, thanks to the attention mechanism and parallel computation. These\nadvantages make such models very powerful on linguistic tasks, unlike recurrent\nneural networks that spend a lot of time on sequential processing, making them\nprone to fail when it comes to processing long text. The scope of our paper\naims to study the behaviour of the cutting-edge Transformer-based language\nmodels on opinion mining and provide a high-level comparison between them to\nhighlight their key particularities. 
Additionally, our comparative study shows\nleads and paves the way for production engineers regarding the approach to\nfocus on and is useful for researchers as it provides guidelines for future\nresearch subjects.\n","authors":["Nour Eddine Zekaoui","Siham Yousfi","Maryem Rhanoui","Mounia Mikram"],"pdf_url":"https://arxiv.org/pdf/2308.03235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03234v1","updated":"2023-08-07T01:03:04Z","published":"2023-08-07T01:03:04Z","title":"Exploring Automated Distractor and Feedback Generation for Math\n Multiple-choice Questions via In-context Learning","summary":" Multiple-choice questions (MCQs) are ubiquitous in almost all levels of\neducation since they are easy to administer, grade, and are a reliable format\nin both assessments and practices. An important aspect of MCQs is the\ndistractors, i.e., incorrect options that are designed to target specific\nmisconceptions or insufficient knowledge among students. To date, the task of\ncrafting high-quality distractors has largely remained a labor-intensive\nprocess for teachers and learning content designers, which has limited\nscalability. In this work, we explore the task of automated distractor and\ncorresponding feedback message generation in math MCQs using large language\nmodels. We establish a formulation of these two tasks and propose a simple,\nin-context learning-based solution. Moreover, we explore using two non-standard\nmetrics to evaluate the quality of the generated distractors and feedback\nmessages. We conduct extensive experiments on these tasks using a real-world\nMCQ dataset that contains student response information. Our findings suggest\nthat there is a lot of room for improvement in automated distractor and\nfeedback generation. We also outline several directions for future work\n","authors":["Hunter McNichols","Wanyong Feng","Jaewook Lee","Alexander Scarlatos","Digory Smith","Simon Woodhead","Andrew Lan"],"pdf_url":"https://arxiv.org/pdf/2308.03234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03958v1","updated":"2023-08-07T23:48:36Z","published":"2023-08-07T23:48:36Z","title":"Simple synthetic data reduces sycophancy in large language models","summary":" Sycophancy is an undesirable behavior where models tailor their responses to\nfollow a human user's view even when that view is not objectively correct\n(e.g., adapting liberal views once a user reveals that they are liberal). In\nthis paper, we study the prevalence of sycophancy in language models and\npropose a simple synthetic-data intervention to reduce this behavior.\n First, on a set of three sycophancy tasks (Perez et al., 2022) where models\nare asked for an opinion on statements with no correct answers (e.g.,\npolitics), we observe that both model scaling and instruction tuning\nsignificantly increase sycophancy for PaLM models up to 540B parameters.\nSecond, we extend sycophancy evaluations to simple addition statements that are\nobjectively incorrect, finding that despite knowing that these statements are\nwrong, language models will still agree with them if the user does as well.\n To reduce sycophancy, we present a straightforward synthetic-data\nintervention that takes public NLP tasks and encourages models to be robust to\nuser opinions on these tasks. Adding these data in a lightweight finetuning\nstep can significantly reduce sycophantic behavior on held-out prompts. 
Code\nfor generating synthetic data for intervention can be found at\nhttps://github.com/google/sycophancy-intervention.\n","authors":["Jerry Wei","Da Huang","Yifeng Lu","Denny Zhou","Quoc V. Le"],"pdf_url":"https://arxiv.org/pdf/2308.03958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03929v1","updated":"2023-08-07T22:13:30Z","published":"2023-08-07T22:13:30Z","title":"Establishing Trust in ChatGPT BioMedical Generated Text: An\n Ontology-Based Knowledge Graph to Validate Disease-Symptom Links","summary":" Methods: Through an innovative approach, we construct ontology-based\nknowledge graphs from authentic medical literature and AI-generated content.\nOur goal is to distinguish factual information from unverified data. We\ncompiled two datasets: one from biomedical literature using a \"human disease\nand symptoms\" query, and another generated by ChatGPT, simulating articles.\nWith these datasets (PubMed and ChatGPT), we curated 10 sets of 250 abstracts\neach, selected randomly with a specific seed. Our method focuses on utilizing\ndisease ontology (DOID) and symptom ontology (SYMP) to build knowledge graphs,\nrobust mathematical models that facilitate unbiased comparisons. By employing\nour fact-checking algorithms and network centrality metrics, we conducted GPT\ndisease-symptoms link analysis to quantify the accuracy of factual knowledge\namid noise, hypotheses, and significant findings.\n Results: The findings obtained from the comparison of diverse ChatGPT\nknowledge graphs with their PubMed counterparts revealed some interesting\nobservations. While PubMed knowledge graphs exhibit a wealth of disease-symptom\nterms, it is surprising to observe that some ChatGPT graphs surpass them in the\nnumber of connections. Furthermore, some GPT graphs are demonstrating supremacy\nof the centrality scores, especially for the overlapping nodes. This striking\ncontrast indicates the untapped potential of knowledge that can be derived from\nAI-generated content, awaiting verification. Out of all the graphs, the factual\nlink ratio between any two graphs reached its peak at 60%.\n Conclusions: An intriguing insight from our findings was the striking number\nof links among terms in the knowledge graph generated from ChatGPT datasets,\nsurpassing some of those in its PubMed counterpart. This early discovery has\nprompted further investigation using universal network metrics to unveil the\nnew knowledge the links may hold.\n","authors":["Ahmed Abdeen Hamed","Alessandro Crimi","Magdalena M. Misiak","Byung Suk Lee"],"pdf_url":"https://arxiv.org/pdf/2308.03929v1.pdf","comment":"7 Pages, 3 algorithms, 4 tables, and 7 figures"},{"id":"http://arxiv.org/abs/2308.02013v2","updated":"2023-08-07T21:34:44Z","published":"2023-08-03T20:08:23Z","title":"Federated Representation Learning for Automatic Speech Recognition","summary":" Federated Learning (FL) is a privacy-preserving paradigm, allowing edge\ndevices to learn collaboratively without sharing data. Edge devices like Alexa\nand Siri are prospective sources of unlabeled audio data that can be tapped to\nlearn robust audio representations. In this work, we bring Self-supervised\nLearning (SSL) and FL together to learn representations for Automatic Speech\nRecognition respecting data privacy constraints. We use the speaker and chapter\ninformation in the unlabeled speech dataset, Libri-Light, to simulate non-IID\nspeaker-siloed data distributions and pre-train an LSTM encoder with the\nContrastive Predictive Coding framework with FedSGD. 
We show that the\npre-trained ASR encoder in FL performs as well as a centrally pre-trained model\nand produces an improvement of 12-15% (WER) compared to no pre-training. We\nfurther adapt the federated pre-trained models to a new language, French, and\nshow a 20% (WER) improvement over no pre-training.\n","authors":["Guruprasad V Ramesh","Gopinath Chennupati","Milind Rao","Anit Kumar Sahu","Ariya Rastrow","Jasha Droppo"],"pdf_url":"https://arxiv.org/pdf/2308.02013v2.pdf","comment":"Accepted at ISCA SPSC Symposium 3rd Symposium on Security and Privacy\n in Speech Communication, 2023"},{"id":"http://arxiv.org/abs/2308.03917v1","updated":"2023-08-07T21:29:51Z","published":"2023-08-07T21:29:51Z","title":"Universal Automatic Phonetic Transcription into the International\n Phonetic Alphabet","summary":" This paper presents a state-of-the-art model for transcribing speech in any\nlanguage into the International Phonetic Alphabet (IPA). Transcription of\nspoken languages into IPA is an essential yet time-consuming process in\nlanguage documentation, and even partially automating this process has the\npotential to drastically speed up the documentation of endangered languages.\nLike the previous best speech-to-IPA model (Wav2Vec2Phoneme), our model is\nbased on wav2vec 2.0 and is fine-tuned to predict IPA from audio input. We use\ntraining data from seven languages from CommonVoice 11.0, transcribed into IPA\nsemi-automatically. Although this training dataset is much smaller than\nWav2Vec2Phoneme's, its higher quality lets our model achieve comparable or\nbetter results. Furthermore, we show that the quality of our universal\nspeech-to-IPA models is close to that of human annotators.\n","authors":["Chihiro Taguchi","Yusuke Sakai","Parisa Haghani","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2308.03917v1.pdf","comment":"5 pages, 7 tables"},{"id":"http://arxiv.org/abs/2308.03905v1","updated":"2023-08-07T20:43:42Z","published":"2023-08-07T20:43:42Z","title":"Intelligent Assistant Language Understanding On Device","summary":" It has recently become feasible to run personal digital assistants on phones\nand other personal devices. In this paper we describe a design for a natural\nlanguage understanding system that runs on device. In comparison to a\nserver-based assistant, this system is more private, more reliable, faster,\nmore expressive, and more accurate. We describe what led to key choices about\narchitecture and technologies. For example, some approaches in the dialog\nsystems literature are difficult to maintain over time in a deployment setting.\nWe hope that sharing learnings from our practical experiences may help inform\nfuture work in the research community.\n","authors":["Cecilia Aas","Hisham Abdelsalam","Irina Belousova","Shruti Bhargava","Jianpeng Cheng","Robert Daland","Joris Driesen","Federico Flego","Tristan Guigue","Anders Johannsen","Partha Lal","Jiarui Lu","Joel Ruben Antony Moniz","Nathan Perkins","Dhivya Piraviperumal","Stephen Pulman","Diarmuid Ó Séaghdha","David Q. Sun","John Torr","Marco Del Vecchio","Jay Wacker","Jason D. Williams","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03891v1","updated":"2023-08-07T19:50:59Z","published":"2023-08-07T19:50:59Z","title":"A Cross-Domain Evaluation of Approaches for Causal Knowledge Extraction","summary":" Causal knowledge extraction is the task of extracting relevant causes and\neffects from text by detecting the causal relation. 
Although this task is\nimportant for language understanding and knowledge discovery, recent works in\nthis domain have largely focused on binary classification of a text segment as\ncausal or non-causal. In this regard, we perform a thorough analysis of three\nsequence tagging models for causal knowledge extraction and compare it with a\nspan based approach to causality extraction. Our experiments show that\nembeddings from pre-trained language models (e.g. BERT) provide a significant\nperformance boost on this task compared to previous state-of-the-art models\nwith complex architectures. We observe that span based models perform better\nthan simple sequence tagging models based on BERT across all 4 data sets from\ndiverse domains with different types of cause-effect phrases.\n","authors":["Anik Saha","Oktie Hassanzadeh","Alex Gittens","Jian Ni","Kavitha Srinivas","Bulent Yener"],"pdf_url":"https://arxiv.org/pdf/2308.03891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03883v1","updated":"2023-08-07T19:26:09Z","published":"2023-08-07T19:26:09Z","title":"Generative Benchmark Creation for Table Union Search","summary":" Data management has traditionally relied on synthetic data generators to\ngenerate structured benchmarks, like the TPC suite, where we can control\nimportant parameters like data size and its distribution precisely. These\nbenchmarks were central to the success and adoption of database management\nsystems. But more and more, data management problems are of a semantic nature.\nAn important example is finding tables that can be unioned. While any two\ntables with the same cardinality can be unioned, table union search is the\nproblem of finding tables whose union is semantically coherent. Semantic\nproblems cannot be benchmarked using synthetic data. Our current methods for\ncreating benchmarks involve the manual curation and labeling of real data.\nThese methods are not robust or scalable and perhaps more importantly, it is\nnot clear how robust the created benchmarks are. We propose to use generative\nAI models to create structured data benchmarks for table union search. We\npresent a novel method for using generative models to create tables with\nspecified properties. Using this method, we create a new benchmark containing\npairs of tables that are both unionable and non-unionable but related. We\nthoroughly evaluate recent existing table union search methods over existing\nbenchmarks and our new benchmark. We also present and evaluate a new table\nsearch methods based on recent large language models over all benchmarks. We\nshow that the new benchmark is more challenging for all methods than\nhand-curated benchmarks, specifically, the top-performing method achieves a\nMean Average Precision of around 60%, over 30% less than its performance on\nexisting manually created benchmarks. We examine why this is the case and show\nthat the new benchmark permits more detailed analysis of methods, including a\nstudy of both false positives and false negatives that were not possible with\nexisting benchmarks.\n","authors":["Koyena Pal","Aamod Khatiwada","Roee Shraga","Renée J. 
Miller"],"pdf_url":"https://arxiv.org/pdf/2308.03883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03869v1","updated":"2023-08-07T18:40:13Z","published":"2023-08-07T18:40:13Z","title":"Semantic Equivalence of e-Commerce Queries","summary":" Search query variation poses a challenge in e-commerce search, as equivalent\nsearch intents can be expressed through different queries with surface-level\ndifferences. This paper introduces a framework to recognize and leverage query\nequivalence to enhance searcher and business outcomes. The proposed approach\naddresses three key problems: mapping queries to vector representations of\nsearch intent, identifying nearest neighbor queries expressing equivalent or\nsimilar intent, and optimizing for user or business objectives. The framework\nutilizes both surface similarity and behavioral similarity to determine query\nequivalence. Surface similarity involves canonicalizing queries based on word\ninflection, word order, compounding, and noise words. Behavioral similarity\nleverages historical search behavior to generate vector representations of\nquery intent. An offline process is used to train a sentence similarity model,\nwhile an online nearest neighbor approach supports processing of unseen\nqueries. Experimental evaluations demonstrate the effectiveness of the proposed\napproach, outperforming popular sentence transformer models and achieving a\nPearson correlation of 0.85 for query similarity. The results highlight the\npotential of leveraging historical behavior data and training models to\nrecognize and utilize query equivalence in e-commerce search, leading to\nimproved user experiences and business outcomes. Further advancements and\nbenchmark datasets are encouraged to facilitate the development of solutions\nfor this critical problem in the e-commerce domain.\n","authors":["Aritra Mandal","Daniel Tunkelang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03869v1.pdf","comment":"The 6th Workshop on e-Commerce and NLP"},{"id":"http://arxiv.org/abs/2308.03866v1","updated":"2023-08-07T18:27:54Z","published":"2023-08-07T18:27:54Z","title":"Trusting Language Models in Education","summary":" Language Models are being widely used in Education. Even though modern deep\nlearning models achieve very good performance on question-answering tasks,\nsometimes they make errors. To avoid misleading students by showing wrong\nanswers, it is important to calibrate the confidence - that is, the prediction\nprobability - of these models. In our work, we propose to use an XGBoost on top\nof BERT to output the corrected probabilities, using features based on the\nattention mechanism. Our hypothesis is that the level of uncertainty contained\nin the flow of attention is related to the quality of the model's response\nitself.\n","authors":["Jogi Suda Neto","Li Deng","Thejaswi Raya","Reza Shahbazi","Nick Liu","Adhitya Venkatesh","Miral Shah","Neeru Khosla","Rodrigo Capobianco Guido"],"pdf_url":"https://arxiv.org/pdf/2308.03866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03864v1","updated":"2023-08-07T18:25:00Z","published":"2023-08-07T18:25:00Z","title":"Storyfier: Exploring Vocabulary Learning Support with Text Generation\n Models","summary":" Vocabulary learning support tools have widely exploited existing materials,\ne.g., stories or video clips, as contexts to help users memorize each target\nword. However, these tools could not provide a coherent context for any target\nwords of learners' interests, and they seldom help practice word usage. 
In this\npaper, we work with teachers and students to iteratively develop Storyfier,\nwhich leverages text generation models to enable learners to read a generated\nstory that covers any target words, conduct a story cloze test, and use these\nwords to write a new story with adaptive AI assistance. Our within-subjects\nstudy (N=28) shows that learners generally favor the generated stories for\nconnecting target words and writing assistance for easing their learning\nworkload. However, in the read-cloze-write learning sessions, participants\nusing Storyfier perform worse in recalling and using target words than learning\nwith a baseline tool without our AI features. We discuss insights into\nsupporting learning tasks with generative models.\n","authors":["Zhenhui Peng","Xingbo Wang","Qiushi Han","Junkai Zhu","Xiaojuan Ma","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2308.03864v1.pdf","comment":"To appear at the 2023 ACM Symposium on User Interface Software and\n Technology (UIST); 16 pages (7 figures, 23 tables)"},{"id":"http://arxiv.org/abs/2308.03853v1","updated":"2023-08-07T18:03:10Z","published":"2023-08-07T18:03:10Z","title":"Extracting detailed oncologic history and treatment plan from medical\n oncology notes with large language models","summary":" Both medical care and observational studies in oncology require a thorough\nunderstanding of a patient's disease progression and treatment history, often\nelaborately documented in clinical notes. Despite their vital role, no current\noncology information representation and annotation schema fully encapsulates\nthe diversity of information recorded within these notes. Although large\nlanguage models (LLMs) have recently exhibited impressive performance on\nvarious medical natural language processing tasks, due to the current lack of\ncomprehensively annotated oncology datasets, an extensive evaluation of LLMs in\nextracting and reasoning with the complex rhetoric in oncology notes remains\nunderstudied. We developed a detailed schema for annotating textual oncology\ninformation, encompassing patient characteristics, tumor characteristics,\ntests, treatments, and temporality. Using a corpus of 10 de-identified breast\ncancer progress notes at University of California, San Francisco, we applied\nthis schema to assess the abilities of three recently-released LLMs (GPT-4,\nGPT-3.5-turbo, and FLAN-UL2) to perform zero-shot extraction of detailed\noncological history from two narrative sections of clinical progress notes. Our\nteam annotated 2750 entities, 2874 modifiers, and 1623 relationships. The GPT-4\nmodel exhibited overall best performance, with an average BLEU score of 0.69,\nan average ROUGE score of 0.72, and an average accuracy of 67% on complex tasks\n(expert manual evaluation). Notably, it was proficient in tumor characteristic\nand medication extraction, and demonstrated superior performance in inferring\nsymptoms due to cancer and considerations of future medications. The analysis\ndemonstrates that GPT-4 is potentially already usable to extract important\nfacts from cancer progress notes needed for clinical research, complex\npopulation management, and documenting quality patient care.\n","authors":["Madhumita Sushil","Vanessa E. Kennedy","Brenda Y. Miao","Divneet Mandair","Travis Zack","Atul J. 
Butte"],"pdf_url":"https://arxiv.org/pdf/2308.03853v1.pdf","comment":"Source code available at:\n https://github.com/MadhumitaSushil/OncLLMExtraction"},{"id":"http://arxiv.org/abs/2308.03311v1","updated":"2023-08-07T05:40:01Z","published":"2023-08-07T05:40:01Z","title":"CrossTalk: Intelligent Substrates for Language-Oriented Interaction in\n Video-Based Communication and Collaboration","summary":" Despite the advances and ubiquity of digital communication media such as\nvideoconferencing and virtual reality, they remain oblivious to the rich\nintentions expressed by users. Beyond transmitting audio, videos, and messages,\nwe envision digital communication media as proactive facilitators that can\nprovide unobtrusive assistance to enhance communication and collaboration.\nInformed by the results of a formative study, we propose three key design\nconcepts to explore the systematic integration of intelligence into\ncommunication and collaboration, including the panel substrate, language-based\nintent recognition, and lightweight interaction techniques. We developed\nCrossTalk, a videoconferencing system that instantiates these concepts, which\nwas found to enable a more fluid and flexible communication and collaboration\nexperience.\n","authors":["Haijun Xia","Tony Wang","Aditya Gunturu","Peiling Jiang","William Duan","Xiaoshuo Yao"],"pdf_url":"https://arxiv.org/pdf/2308.03311v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.03757v1","updated":"2023-08-07T17:59:59Z","published":"2023-08-07T17:59:59Z","title":"3D Motion Magnification: Visualizing Subtle Motions with Time Varying\n Radiance Fields","summary":" Motion magnification helps us visualize subtle, imperceptible motion.\nHowever, prior methods only work for 2D videos captured with a fixed camera. We\npresent a 3D motion magnification method that can magnify subtle motions from\nscenes captured by a moving camera, while supporting novel view rendering. We\nrepresent the scene with time-varying radiance fields and leverage the Eulerian\nprinciple for motion magnification to extract and amplify the variation of the\nembedding of a fixed point over time. We study and validate our proposed\nprinciple for 3D motion magnification using both implicit and tri-plane-based\nradiance fields as our underlying 3D scene representation. We evaluate the\neffectiveness of our method on both synthetic and real-world scenes captured\nunder various camera setups.\n","authors":["Brandon Y. Feng","Hadi Alzayer","Michael Rubinstein","William T. Freeman","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03757v1.pdf","comment":"ICCV 2023. See the project page at\n https://3d-motion-magnification.github.io"},{"id":"http://arxiv.org/abs/2209.11359v4","updated":"2023-08-07T17:59:53Z","published":"2022-09-23T01:09:06Z","title":"CUTS: A Fully Unsupervised Framework for Medical Image Segmentation","summary":" In this work we introduce CUTS (Contrastive and Unsupervised Training for\nSegmentation), a fully unsupervised deep learning framework for medical image\nsegmentation to better utilize the vast majority of imaging data that is not\nlabeled or annotated. We utilize self-supervision from pixels and their local\nneighborhoods in the images themselves. Our unsupervised approach optimizes a\ntraining objective that leverages concepts from contrastive learning and\nautoencoding. Our framework segments medical images with a novel two-stage\napproach without relying on any labeled data at any stage. 
The first stage\ninvolves the creation of a \"pixel-centered patch\" that embeds every pixel along\nwith its surrounding patch, using a vector representation in a high-dimensional\nlatent embedding space. The second stage utilizes diffusion condensation, a\nmulti-scale topological data analysis approach, to dynamically coarse-grain\nthese embedding vectors at all levels of granularity. The final outcome is a\nseries of coarse-to-fine segmentations that highlight image structures at\nvarious scales. In this work, we show successful multi-scale segmentation on\nnatural images, retinal fundus images, and brain MRI images. Our framework\ndelineates structures and patterns at different scales which, in the cases of\nmedical images, may carry distinct information relevant to clinical\ninterpretation. Quantitatively, our framework demonstrates improvements ranging\nfrom 10% to 200% on dice coefficient and Hausdorff distance compared to\nexisting unsupervised methods across three medical image datasets. As we tackle\nthe problem of segmenting medical images at multiple meaningful granularities\nwithout relying on any label, we hope to demonstrate the possibility to\ncircumvent tedious and repetitive manual annotations in future practice.\n","authors":["Chen Liu","Matthew Amodio","Liangbo L. Shen","Feng Gao","Arman Avesta","Sanjay Aneja","Jay C. Wang","Lucian V. Del Priore","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2209.11359v4.pdf","comment":"Included new dataset. Ensured evaluation consistency among competing\n methods"},{"id":"http://arxiv.org/abs/2308.03755v1","updated":"2023-08-07T17:59:48Z","published":"2023-08-07T17:59:48Z","title":"FSD V2: Improving Fully Sparse 3D Object Detection with Virtual Voxels","summary":" LiDAR-based fully sparse architecture has garnered increasing attention.\nFSDv1 stands out as a representative work, achieving impressive efficacy and\nefficiency, albeit with intricate structures and handcrafted designs. In this\npaper, we present FSDv2, an evolution that aims to simplify the previous FSDv1\nwhile eliminating the inductive bias introduced by its handcrafted\ninstance-level representation, thus promoting better general applicability. To\nthis end, we introduce the concept of \\textbf{virtual voxels}, which takes over\nthe clustering-based instance segmentation in FSDv1. Virtual voxels not only\naddress the notorious issue of the Center Feature Missing problem in fully\nsparse detectors but also endow the framework with a more elegant and\nstreamlined approach. Consequently, we develop a suite of components to\ncomplement the virtual voxel concept, including a virtual voxel encoder, a\nvirtual voxel mixer, and a virtual voxel assignment strategy. Through empirical\nvalidation, we demonstrate that the virtual voxel mechanism is functionally\nsimilar to the handcrafted clustering in FSDv1 while being more general. We\nconduct experiments on three large-scale datasets: Waymo Open Dataset,\nArgoverse 2 dataset, and nuScenes dataset. Our results showcase\nstate-of-the-art performance on all three datasets, highlighting the\nsuperiority of FSDv2 in long-range scenarios and its general applicability to\nachieve competitive performance across diverse scenarios. Moreover, we provide\ncomprehensive experimental analysis to elucidate the workings of FSDv2. 
To\nfoster reproducibility and further research, we have open-sourced FSDv2 at\nhttps://github.com/tusen-ai/SST.\n","authors":["Lue Fan","Feng Wang","Naiyan Wang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09027v3","updated":"2023-08-07T17:56:54Z","published":"2022-11-12T10:12:17Z","title":"LLEDA -- Lifelong Self-Supervised Domain Adaptation","summary":" Humans and animals have the ability to continuously learn new information\nover their lifetime without losing previously acquired knowledge. However,\nartificial neural networks struggle with this due to new information\nconflicting with old knowledge, resulting in catastrophic forgetting. The\ncomplementary learning systems (CLS) theory suggests that the interplay between\nhippocampus and neocortex systems enables long-term and efficient learning in\nthe mammalian brain, with memory replay facilitating the interaction between\nthese two systems to reduce forgetting. The proposed Lifelong Self-Supervised\nDomain Adaptation (LLEDA) framework draws inspiration from the CLS theory and\nmimics the interaction between two networks: a DA network inspired by the\nhippocampus that quickly adjusts to changes in data distribution and an SSL\nnetwork inspired by the neocortex that gradually learns domain-agnostic general\nrepresentations. LLEDA's latent replay technique facilitates communication\nbetween these two networks by reactivating and replaying the past memory latent\nrepresentations to stabilise long-term generalisation and retention without\ninterfering with the previously learned information. Extensive experiments\ndemonstrate that the proposed method outperforms several other methods\nresulting in a long-term adaptation while being less prone to catastrophic\nforgetting when transferred to new domains.\n","authors":["Mamatha Thota","Dewei Yi","Georgios Leontidis"],"pdf_url":"https://arxiv.org/pdf/2211.09027v3.pdf","comment":"19 pages, 6 figures, 6 tables; V2 added more experiments on more\n domains and fixed typos"},{"id":"http://arxiv.org/abs/2308.03747v1","updated":"2023-08-07T17:53:21Z","published":"2023-08-07T17:53:21Z","title":"Mask Frozen-DETR: High Quality Instance Segmentation with One GPU","summary":" In this paper, we aim to study how to build a strong instance segmenter with\nminimal training time and GPUs, as opposed to the majority of current\napproaches that pursue more accurate instance segmenter by building more\nadvanced frameworks at the cost of longer training time and higher GPU\nrequirements. To achieve this, we introduce a simple and general framework,\ntermed Mask Frozen-DETR, which can convert any existing DETR-based object\ndetection model into a powerful instance segmentation model. Our method only\nrequires training an additional lightweight mask network that predicts instance\nmasks within the bounding boxes given by a frozen DETR-based object detector.\nRemarkably, our method outperforms the state-of-the-art instance segmentation\nmethod Mask DINO in terms of performance on the COCO test-dev split (55.3% vs.\n54.7%) while being over 10X times faster to train. 
Furthermore, all of our\nexperiments can be trained using only one Tesla V100 GPU with 16 GB of memory,\ndemonstrating the significant efficiency of our proposed framework.\n","authors":["Zhanhao Liang","Yuhui Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.03747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01390v2","updated":"2023-08-07T17:53:09Z","published":"2023-08-02T19:10:23Z","title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive\n Vision-Language Models","summary":" We introduce OpenFlamingo, a family of autoregressive vision-language models\nranging from 3B to 9B parameters. OpenFlamingo is an ongoing effort to produce\nan open-source replication of DeepMind's Flamingo models. On seven\nvision-language datasets, OpenFlamingo models average between 80 - 89% of\ncorresponding Flamingo performance. This technical report describes our models,\ntraining data, hyperparameters, and evaluation suite. We share our models and\ncode at https://github.com/mlfoundations/open_flamingo.\n","authors":["Anas Awadalla","Irena Gao","Josh Gardner","Jack Hessel","Yusuf Hanafy","Wanrong Zhu","Kalyani Marathe","Yonatan Bitton","Samir Gadre","Shiori Sagawa","Jenia Jitsev","Simon Kornblith","Pang Wei Koh","Gabriel Ilharco","Mitchell Wortsman","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.01390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09597v6","updated":"2023-08-07T17:50:52Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v6.pdf","comment":"ACL 2023, fixed Equation 2"},{"id":"http://arxiv.org/abs/2308.03729v1","updated":"2023-08-07T17:17:05Z","published":"2023-08-07T17:17:05Z","title":"Tiny LVLM-eHub: Early Multimodal Experiments with Bard","summary":" Recent advancements in Large Vision-Language Models (LVLMs) have demonstrated\nsignificant progress in tackling complex multimodal tasks. Among these\ncutting-edge developments, Google's Bard stands out for its remarkable\nmultimodal capabilities, promoting comprehensive comprehension and reasoning\nacross various domains. This work presents an early and holistic evaluation of\nLVLMs' multimodal abilities, with a particular focus on Bard, by proposing a\nlightweight variant of LVLM-eHub, named Tiny LVLM-eHub. In comparison to the\nvanilla version, Tiny LVLM-eHub possesses several appealing properties.\nFirstly, it provides a systematic assessment of six categories of multimodal\ncapabilities, including visual perception, visual knowledge acquisition, visual\nreasoning, visual commonsense, object hallucination, and embodied intelligence,\nthrough quantitative evaluation of $42$ standard text-related visual\nbenchmarks. 
Secondly, it conducts an in-depth analysis of LVLMs' predictions\nusing the ChatGPT Ensemble Evaluation (CEE), which leads to a robust and\naccurate evaluation and exhibits improved alignment with human evaluation\ncompared to the word matching approach. Thirdly, it comprises a mere $2.1$K\nimage-text pairs, facilitating ease of use for practitioners to evaluate their\nown offline LVLMs. Through extensive experimental analysis, this study\ndemonstrates that Bard outperforms previous LVLMs in most multimodal\ncapabilities except object hallucination, to which Bard is still susceptible.\nTiny LVLM-eHub serves as a baseline evaluation for various LVLMs and encourages\ninnovative strategies aimed at advancing multimodal techniques. Our project is\npublicly available at \\url{https://github.com/OpenGVLab/Multi-Modality-Arena}.\n","authors":["Wenqi Shao","Yutao Hu","Peng Gao","Meng Lei","Kaipeng Zhang","Fanqing Meng","Peng Xu","Siyuan Huang","Hongsheng Li","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.03729v1.pdf","comment":"24 pages, 24 figures, 7 Tables. Project Page:\n http://lvlm-ehub.opengvlab.com/"},{"id":"http://arxiv.org/abs/2308.03726v1","updated":"2023-08-07T17:12:54Z","published":"2023-08-07T17:12:54Z","title":"AdaptiveSAM: Towards Efficient Tuning of SAM for Surgical Scene\n Segmentation","summary":" Segmentation is a fundamental problem in surgical scene analysis using\nartificial intelligence. However, the inherent data scarcity in this domain\nmakes it challenging to adapt traditional segmentation techniques for this\ntask. To tackle this issue, current research employs pretrained models and\nfinetunes them on the given data. Even so, these require training deep networks\nwith millions of parameters every time new data becomes available. A recently\npublished foundation model, Segment-Anything (SAM), generalizes well to a large\nvariety of natural images, hence tackling this challenge to a reasonable\nextent. However, SAM does not generalize well to the medical domain as is\nwithout utilizing a large amount of compute resources for fine-tuning and using\ntask-specific prompts. Moreover, these prompts are in the form of\nbounding-boxes or foreground/background points that need to be annotated\nexplicitly for every image, making this solution increasingly tedious with\nhigher data size. In this work, we propose AdaptiveSAM - an adaptive\nmodification of SAM that can adjust to new datasets quickly and efficiently,\nwhile enabling text-prompted segmentation. For finetuning AdaptiveSAM, we\npropose an approach called bias-tuning that requires a significantly smaller\nnumber of trainable parameters than SAM (less than 2\\%). At the same time,\nAdaptiveSAM requires negligible expert intervention since it uses free-form\ntext as prompt and can segment the object of interest with just the label name\nas prompt. Our experiments show that AdaptiveSAM outperforms current\nstate-of-the-art methods on various medical imaging datasets including surgery,\nultrasound and X-ray. Code is available at\nhttps://github.com/JayParanjape/biastuning\n","authors":["Jay N. Paranjape","Nithin Gopalakrishnan Nair","Shameema Sikder","S. Swaroop Vedula","Vishal M. 
Patel"],"pdf_url":"https://arxiv.org/pdf/2308.03726v1.pdf","comment":"10 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.03725v1","updated":"2023-08-07T17:07:48Z","published":"2023-08-07T17:07:48Z","title":"Efficient Temporal Sentence Grounding in Videos with Multi-Teacher\n Knowledge Distillation","summary":" Temporal Sentence Grounding in Videos (TSGV) aims to detect the event\ntimestamps described by the natural language query from untrimmed videos. This\npaper discusses the challenge of achieving efficient computation in TSGV models\nwhile maintaining high performance. Most existing approaches exquisitely design\ncomplex architectures to improve accuracy with extra layers and loss, suffering\nfrom inefficiency and heaviness. Although some works have noticed that, they\nonly make an issue of feature fusion layers, which can hardly enjoy the\nhighspeed merit in the whole clunky network. To tackle this problem, we propose\na novel efficient multi-teacher model (EMTM) based on knowledge distillation to\ntransfer diverse knowledge from both heterogeneous and isomorphic networks.\nSpecifically, We first unify different outputs of the heterogeneous models into\none single form. Next, a Knowledge Aggregation Unit (KAU) is built to acquire\nhigh-quality integrated soft labels from multiple teachers. After that, the KAU\nmodule leverages the multi-scale video and global query information to\nadaptively determine the weights of different teachers. A Shared Encoder\nstrategy is then proposed to solve the problem that the student shallow layers\nhardly benefit from teachers, in which an isomorphic teacher is collaboratively\ntrained with the student to align their hidden states. Extensive experimental\nresults on three popular TSGV benchmarks demonstrate that our method is both\neffective and efficient without bells and whistles.\n","authors":["Renjie Liang","Yiming Yang","Hui Lu","Li Li"],"pdf_url":"https://arxiv.org/pdf/2308.03725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03723v1","updated":"2023-08-07T16:58:48Z","published":"2023-08-07T16:58:48Z","title":"Dimensionality Reduction for Improving Out-of-Distribution Detection in\n Medical Image Segmentation","summary":" Clinically deployed segmentation models are known to fail on data outside of\ntheir training distribution. As these models perform well on most cases, it is\nimperative to detect out-of-distribution (OOD) images at inference to protect\nagainst automation bias. This work applies the Mahalanobis distance post hoc to\nthe bottleneck features of a Swin UNETR model that segments the liver on\nT1-weighted magnetic resonance imaging. By reducing the dimensions of the\nbottleneck features with principal component analysis, OOD images were detected\nwith high performance and minimal computational load.\n","authors":["McKell Woodland","Nihil Patel","Mais Al Taie","Joshua P. Yung","Tucker J. Netherton","Ankit B. Patel","Kristy K. Brock"],"pdf_url":"https://arxiv.org/pdf/2308.03723v1.pdf","comment":"This preprint has not undergone peer review or any post-submission\n improvements or corrections. 
The Version of Record of this contribution will\n be published in the Proceedings of Uncertainty for Safe Utilization of\n Machine Learning in Medical Imaging (5th International Workshop) - Held in\n conjunction with MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.03718v1","updated":"2023-08-07T16:43:46Z","published":"2023-08-07T16:43:46Z","title":"SEM-GAT: Explainable Semantic Pose Estimation using Learned Graph\n Attention","summary":" This paper proposes a GNN-based method for exploiting semantics and local\ngeometry to guide the identification of reliable pointcloud registration\ncandidates. Semantic and morphological features of the environment serve as key\nreference points for registration, enabling accurate lidar-based pose\nestimation. Our novel lightweight static graph structure informs our\nattention-based keypoint node aggregation GNN network by identifying semantic\ninstance-based relationships, acting as inductive bias to significantly reduce\nthe computational burden of pointcloud registration. By connecting candidate\nnodes and exploiting cross-graph attention, we identify confidence scores for\nall potential registration correspondences, estimating the displacement between\npointcloud scans. Our pipeline enables introspective analysis of the model's\nperformance by correlating it with the individual contributions of local\nstructures in the environment, providing valuable insights into the system's\nbehaviour. We test our method on the KITTI odometry dataset, achieving\ncompetitive accuracy compared to benchmark methods and a higher track\nsmoothness while relying on significantly fewer network parameters.\n","authors":["Efimia Panagiotaki","Daniele De Martini","Georgi Pramatarov","Matthew Gadd","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2308.03718v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.03717v1","updated":"2023-08-07T16:40:19Z","published":"2023-08-07T16:40:19Z","title":"Automated Real Time Delineation of Supraclavicular Brachial Plexus in\n Neck Ultrasonography Videos: A Deep Learning Approach","summary":" Peripheral nerve blocks are crucial to treatment of post-surgical pain and\nare associated with reduction in perioperative opioid use and hospital stay.\nAccurate interpretation of sono-anatomy is critical for the success of\nultrasound (US) guided peripheral nerve blocks and can be challenging to the\nnew operators. This prospective study enrolled 227 subjects who were\nsystematically scanned for supraclavicular and interscalene brachial plexus in\nvarious settings using three different US machines to create a dataset of 227\nunique videos. In total, 41,000 video frames were annotated by experienced\nanaesthesiologists using partial automation with object tracking and active\ncontour algorithms. Four baseline neural network models were trained on the\ndataset and their performance was evaluated for object detection and\nsegmentation tasks. Generalizability of the best suited model was then tested\non the datasets constructed from separate US scanners with and without\nfine-tuning. The results demonstrate that deep learning models can be leveraged\nfor real time segmentation of supraclavicular brachial plexus in neck\nultrasonography videos with high accuracy and reliability. Model was also\ntested for its ability to differentiate between supraclavicular and adjoining\ninterscalene brachial plexus. 
The entire dataset has been released publicly for\nfurther study by the research community.\n","authors":["Abhay Tyagi","Abhishek Tyagi","Manpreet Kaur","Jayanthi Sivaswami","Richa Aggarwal","Kapil Dev Soni","Anjan Trikha"],"pdf_url":"https://arxiv.org/pdf/2308.03717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03712v1","updated":"2023-08-07T16:31:38Z","published":"2023-08-07T16:31:38Z","title":"Scaling may be all you need for achieving human-level object recognition\n capacity with human-like visual experience","summary":" This paper asks whether current self-supervised learning methods, if\nsufficiently scaled up, would be able to reach human-level visual object\nrecognition capabilities with the same type and amount of visual experience\nhumans learn from. Previous work on this question only considered the scaling\nof data size. Here, we consider the simultaneous scaling of data size, model\nsize, and image resolution. We perform a scaling experiment with vision\ntransformers up to 633M parameters in size (ViT-H/14) trained with up to 5K\nhours of human-like video data (long, continuous, mostly egocentric videos)\nwith image resolutions of up to 476x476 pixels. The efficiency of masked\nautoencoders (MAEs) as a self-supervised learning algorithm makes it possible\nto run this scaling experiment on an unassuming academic budget. We find that\nit is feasible to reach human-level object recognition capacity at sub-human\nscales of model size, data size, and image size, if these factors are scaled up\nsimultaneously. To give a concrete example, we estimate that a 2.5B parameter\nViT model trained with 20K hours (2.3 years) of human-like video data with a\nspatial resolution of 952x952 pixels should be able to reach human-level\naccuracy on ImageNet. Human-level competence is thus achievable for a\nfundamental perceptual capability from human-like perceptual experience\n(human-like in both amount and type) with extremely generic learning algorithms\nand architectures and without any substantive inductive biases.\n","authors":["A. Emin Orhan"],"pdf_url":"https://arxiv.org/pdf/2308.03712v1.pdf","comment":"7 pages, 3 figures, 2 tables; code & models available from\n https://github.com/eminorhan/humanlike-vits"},{"id":"http://arxiv.org/abs/2308.03709v1","updated":"2023-08-07T16:30:24Z","published":"2023-08-07T16:30:24Z","title":"Prototype Learning for Out-of-Distribution Polyp Segmentation","summary":" Existing polyp segmentation models from colonoscopy images often fail to\nprovide reliable segmentation results on datasets from different centers,\nlimiting their applicability. Our objective in this study is to create a robust\nand well-generalized segmentation model named PrototypeLab that can assist in\npolyp segmentation. To achieve this, we incorporate various lighting modes such\nas White light imaging (WLI), Blue light imaging (BLI), Linked color imaging\n(LCI), and Flexible spectral imaging color enhancement (FICE) into our new\nsegmentation model, that learns to create prototypes for each class of object\npresent in the images. These prototypes represent the characteristic features\nof the objects, such as their shape, texture, color. Our model is designed to\nperform effectively on out-of-distribution (OOD) datasets from multiple\ncenters. We first generate a coarse mask that is used to learn prototypes for\nthe main object class, which are then employed to generate the final\nsegmentation mask. 
By using prototypes to represent the main class, our\napproach handles the variability present in the medical images and generalize\nwell to new data since prototype capture the underlying distribution of the\ndata. PrototypeLab offers a promising solution with a dice coefficient of\n$\\geq$ 90\\% and mIoU $\\geq$ 85\\% with a near real-time processing speed for\npolyp segmentation. It achieved superior performance on OOD datasets compared\nto 16 state-of-the-art image segmentation architectures, potentially improving\nclinical outcomes. Codes are available at\nhttps://github.com/xxxxx/PrototypeLab.\n","authors":["Nikhil Kumar Tomar","Debesh Jha","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2308.03709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03703v1","updated":"2023-08-07T16:22:47Z","published":"2023-08-07T16:22:47Z","title":"Video-based Person Re-identification with Long Short-Term Representation\n Learning","summary":" Video-based person Re-Identification (V-ReID) aims to retrieve specific\npersons from raw videos captured by non-overlapped cameras. As a fundamental\ntask, it spreads many multimedia and computer vision applications. However, due\nto the variations of persons and scenes, there are still many obstacles that\nmust be overcome for high performance. In this work, we notice that both the\nlong-term and short-term information of persons are important for robust video\nrepresentations. Thus, we propose a novel deep learning framework named Long\nShort-Term Representation Learning (LSTRL) for effective V-ReID. More\nspecifically, to extract long-term representations, we propose a\nMulti-granularity Appearance Extractor (MAE), in which four granularity\nappearances are effectively captured across multiple frames. Meanwhile, to\nextract short-term representations, we propose a Bi-direction Motion Estimator\n(BME), in which reciprocal motion information is efficiently extracted from\nconsecutive frames. The MAE and BME are plug-and-play and can be easily\ninserted into existing networks for efficient feature learning. As a result,\nthey significantly improve the feature representation ability for V-ReID.\nExtensive experiments on three widely used benchmarks show that our proposed\napproach can deliver better performances than most state-of-the-arts.\n","authors":["Xuehu Liu","Pingping Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03703v1.pdf","comment":"This work is accepted by ICIG2023, including 13 pages, 5 figures and\n 5 tables. Modifications may be performed for further improvements"},{"id":"http://arxiv.org/abs/2308.03698v1","updated":"2023-08-07T16:14:27Z","published":"2023-08-07T16:14:27Z","title":"Screen-based 3D Subjective Experiment Software","summary":" Recently, widespread 3D graphics (e.g., point clouds and meshes) have drawn\nconsiderable efforts from academia and industry to assess their perceptual\nquality by conducting subjective experiments. However, lacking a handy software\nfor 3D subjective experiments complicates the construction of 3D graphics\nquality assessment datasets, thus hindering the prosperity of relevant fields.\nIn this paper, we develop a powerful platform with which users can flexibly\ndesign their 3D subjective methodologies and build high-quality datasets,\neasing a broad spectrum of 3D graphics subjective quality study. 
To accurately\nillustrate the perceptual quality differences of 3D stimuli, our software can\nsimultaneously render the source stimulus and impaired stimulus and allows both\nstimuli to respond synchronously to viewer interactions. Compared with amateur\n3D visualization tool-based or image/video rendering-based schemes, our\napproach embodies typical 3D applications while minimizing cognitive overload\nduring subjective experiments. We organized a subjective experiment involving\n40 participants to verify the validity of the proposed software. Experimental\nanalyses demonstrate that subjective tests on our software can produce\nreasonable subjective quality scores of 3D models. All resources in this paper\ncan be found at https://openi.pcl.ac.cn/OpenDatasets/3DQA.\n","authors":["Songlin Fan","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2308.03698v1.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.03685v1","updated":"2023-08-07T16:00:22Z","published":"2023-08-07T16:00:22Z","title":"Learning Concise and Descriptive Attributes for Visual Recognition","summary":" Recent advances in foundation models present new opportunities for\ninterpretable visual recognition -- one can first query Large Language Models\n(LLMs) to obtain a set of attributes that describe each class, then apply\nvision-language models to classify images via these attributes. Pioneering work\nshows that querying thousands of attributes can achieve performance competitive\nwith image features. However, our further investigation on 8 datasets reveals\nthat LLM-generated attributes in a large quantity perform almost the same as\nrandom words. This surprising finding suggests that significant noise may be\npresent in these attributes. We hypothesize that there exist subsets of\nattributes that can maintain the classification performance with much smaller\nsizes, and propose a novel learning-to-search method to discover those concise\nsets of attributes. As a result, on the CUB dataset, our method achieves\nperformance close to that of massive LLM-generated attributes (e.g., 10k\nattributes for CUB), yet using only 32 attributes in total to distinguish 200\nbird species. Furthermore, our new paradigm demonstrates several additional\nbenefits: higher interpretability and interactivity for humans, and the ability\nto summarize knowledge for a recognition task.\n","authors":["An Yan","Yu Wang","Yiwu Zhong","Chengyu Dong","Zexue He","Yujie Lu","William Wang","Jingbo Shang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2308.03685v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03670v1","updated":"2023-08-07T15:44:58Z","published":"2023-08-07T15:44:58Z","title":"Improving FHB Screening in Wheat Breeding Using an Efficient Transformer\n Model","summary":" Fusarium head blight is a devastating disease that causes significant\neconomic losses annually on small grains. Efficiency, accuracy, and timely\ndetection of FHB in the resistance screening are critical for wheat and barley\nbreeding programs. In recent years, various image processing techniques have\nbeen developed using supervised machine learning algorithms for the early\ndetection of FHB. The state-of-the-art convolutional neural network-based\nmethods, such as U-Net, employ a series of encoding blocks to create a local\nrepresentation and a series of decoding blocks to capture the semantic\nrelations. 
However, these methods are not often capable of long-range modeling\ndependencies inside the input data, and their ability to model multi-scale\nobjects with significant variations in texture and shape is limited. Vision\ntransformers as alternative architectures with innate global self-attention\nmechanisms for sequence-to-sequence prediction, due to insufficient low-level\ndetails, may also limit localization capabilities. To overcome these\nlimitations, a new Context Bridge is proposed to integrate the local\nrepresentation capability of the U-Net network in the transformer model. In\naddition, the standard attention mechanism of the original transformer is\nreplaced with Efficient Self-attention, which is less complicated than other\nstate-of-the-art methods. To train the proposed network, 12,000 wheat images\nfrom an FHB-inoculated wheat field at the SDSU research farm in Volga, SD, were\ncaptured. In addition to healthy and unhealthy plants, these images encompass\nvarious stages of the disease. A team of expert pathologists annotated the\nimages for training and evaluating the developed model. As a result, the\neffectiveness of the transformer-based method for FHB-disease detection,\nthrough extensive experiments across typical tasks for plant image\nsegmentation, is demonstrated.\n","authors":["Babak Azad","Ahmed Abdalla","Kwanghee Won","Ali Mirzakhani Nafchi"],"pdf_url":"https://arxiv.org/pdf/2308.03670v1.pdf","comment":"10 pages, 5 figures, 1 table. Presented at the 2023 ASABE Annual\n International Meeting conference in Omaha, Nebraska. Also available at\n https://elibrary.asabe.org/abstract.asp?aid=54149"},{"id":"http://arxiv.org/abs/2307.16177v2","updated":"2023-08-07T15:22:37Z","published":"2023-07-30T09:15:38Z","title":"Fusing VHR Post-disaster Aerial Imagery and LiDAR Data for Roof\n Classification in the Caribbean using CNNs","summary":" Accurate and up-to-date information on building characteristics is essential\nfor vulnerability assessment; however, the high costs and long timeframes\nassociated with conducting traditional field surveys can be an obstacle to\nobtaining critical exposure datasets needed for disaster risk management. In\nthis work, we leverage deep learning techniques for the automated\nclassification of roof characteristics from very high-resolution orthophotos\nand airborne LiDAR data obtained in Dominica following Hurricane Maria in 2017.\nWe demonstrate that the fusion of multimodal earth observation data performs\nbetter than using any single data source alone. Using our proposed methods, we\nachieve F1 scores of 0.93 and 0.92 for roof type and roof material\nclassification, respectively. This work is intended to help governments produce\nmore timely building information to improve resilience and disaster response in\nthe Caribbean.\n","authors":["Isabelle Tingzon","Nuala Margaret Cowan","Pierre Chrzanowski"],"pdf_url":"https://arxiv.org/pdf/2307.16177v2.pdf","comment":"2023 ICCV Humanitarian Assistance and Disaster Response Workshop"},{"id":"http://arxiv.org/abs/2308.03654v1","updated":"2023-08-07T15:10:21Z","published":"2023-08-07T15:10:21Z","title":"FFF: Fragments-Guided Flexible Fitting for Building Complete Protein\n Structures","summary":" Cryo-electron microscopy (cryo-EM) is a technique for reconstructing the\n3-dimensional (3D) structure of biomolecules (especially large protein\ncomplexes and molecular assemblies). As the resolution increases to the\nnear-atomic scale, building protein structures de novo from cryo-EM maps\nbecomes possible. 
Recently, recognition-based de novo building methods have\nshown the potential to streamline this process. However, it cannot build a\ncomplete structure due to the low signal-to-noise ratio (SNR) problem. At the\nsame time, AlphaFold has led to a great breakthrough in predicting protein\nstructures. This has inspired us to combine fragment recognition and structure\nprediction methods to build a complete structure. In this paper, we propose a\nnew method named FFF that bridges protein structure prediction and protein\nstructure recognition with flexible fitting. First, a multi-level recognition\nnetwork is used to capture various structural features from the input 3D\ncryo-EM map. Next, protein structural fragments are generated using pseudo\npeptide vectors and a protein sequence alignment method based on these\nextracted features. Finally, a complete structural model is constructed using\nthe predicted protein fragments via flexible fitting. Based on our benchmark\ntests, FFF outperforms the baseline methods for building complete protein\nstructures.\n","authors":["Weijie Chen","Xinyan Wang","Yuhang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03654v1.pdf","comment":"Published in the Proceedings of the IEEE/CVF Conference on Computer\n Vision and Pattern Recognition (CVPR), 2023"},{"id":"http://arxiv.org/abs/2308.03652v1","updated":"2023-08-07T15:07:21Z","published":"2023-08-07T15:07:21Z","title":"WarpEM: Dynamic Time Warping for Accurate Catheter Registration in\n EM-guided Procedures","summary":" Accurate catheter tracking is crucial during minimally invasive endovascular\nprocedures (MIEP), and electromagnetic (EM) tracking is a widely used\ntechnology that serves this purpose. However, registration between preoperative\nimages and the EM tracking system is often challenging. Existing registration\nmethods typically require manual interactions, which can be time-consuming,\nincrease the risk of errors and change the procedural workflow. Although\nseveral registration methods are available for catheter tracking, such as\nmarker-based and path-based approaches, their limitations can impact the\naccuracy of the resulting tracking solution, consequently, the outcome of the\nmedical procedure.\n This paper introduces a novel automated catheter registration method for\nEM-guided MIEP. The method utilizes 3D signal temporal analysis, such as\nDynamic Time Warping (DTW) algorithms, to improve registration accuracy and\nreliability compared to existing methods. DTW can accurately warp and match\nEM-tracked paths to the vessel's centerline, making it particularly suitable\nfor registration. The introduced registration method is evaluated for accuracy\nin a vascular phantom using a marker-based registration as the ground truth.\nThe results indicate that the DTW method yields accurate and reliable\nregistration outcomes, with a mean error of $2.22$mm. 
The introduced\nregistration method presents several advantages over state-of-the-art methods,\nsuch as high registration accuracy, no initialization required, and increased\nautomation.\n","authors":["Ardit Ramadani","Peter Ewert","Heribert Schunkert","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2308.03652v1.pdf","comment":"The 26th International Conference on Medical Image Computing and\n Computer Assisted Intervention, MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.03631v1","updated":"2023-08-07T14:36:49Z","published":"2023-08-07T14:36:49Z","title":"Segmentation Framework for Heat Loss Identification in Thermal Images:\n Empowering Scottish Retrofitting and Thermographic Survey Companies","summary":" Retrofitting and thermographic survey (TS) companies in Scotland collaborate\nwith social housing providers to tackle fuel poverty. They employ ground-level\ninfrared (IR) camera-based-TSs (GIRTSs) for collecting thermal images to\nidenti-fy the heat loss sources resulting from poor insulation. However, this\nidentifica-tion process is labor-intensive and time-consuming, necessitating\nextensive data processing. To automate this, an AI-driven approach is\nnecessary. Therefore, this study proposes a deep learning (DL)-based\nsegmentation framework using the Mask Region Proposal Convolutional Neural\nNetwork (Mask RCNN) to validate its applicability to these thermal images. The\nobjective of the framework is to au-tomatically identify, and crop heat loss\nsources caused by weak insulation, while also eliminating obstructive objects\npresent in those images. By doing so, it min-imizes labor-intensive tasks and\nprovides an automated, consistent, and reliable solution. To validate the\nproposed framework, approximately 2500 thermal imag-es were collected in\ncollaboration with industrial TS partner. Then, 1800 repre-sentative images\nwere carefully selected with the assistance of experts and anno-tated to\nhighlight the target objects (TO) to form the final dataset. Subsequently, a\ntransfer learning strategy was employed to train the dataset, progressively\naug-menting the training data volume and fine-tuning the pre-trained baseline\nMask RCNN. As a result, the final fine-tuned model achieved a mean average\nprecision (mAP) score of 77.2% for segmenting the TO, demonstrating the\nsignificant po-tential of proposed framework in accurately quantifying energy\nloss in Scottish homes.\n","authors":["Md Junayed Hasan","Eyad Elyan","Yijun Yan","Jinchang Ren","Md Mostafa Kamal Sarker"],"pdf_url":"https://arxiv.org/pdf/2308.03631v1.pdf","comment":"9 Pages, 3 Figures, Accepted from the conference - BICS 2023: 2023\n International Conference on Brain-Inspired Cognitive Systems Kuala Lumpur,\n Malaysia, August 5-6, 2023 [peer-reviewed]"},{"id":"http://arxiv.org/abs/2308.03624v1","updated":"2023-08-07T14:31:07Z","published":"2023-08-07T14:31:07Z","title":"MOMA-Force: Visual-Force Imitation for Real-World Mobile Manipulation","summary":" In this paper, we present a novel method for mobile manipulators to perform\nmultiple contact-rich manipulation tasks. While learning-based methods have the\npotential to generate actions in an end-to-end manner, they often suffer from\ninsufficient action accuracy and robustness against noise. On the other hand,\nclassical control-based methods can enhance system robustness, but at the cost\nof extensive parameter tuning. 
To address these challenges, we present\nMOMA-Force, a visual-force imitation method that seamlessly combines\nrepresentation learning for perception, imitation learning for complex motion\ngeneration, and admittance whole-body control for system robustness and\ncontrollability. MOMA-Force enables a mobile manipulator to learn multiple\ncomplex contact-rich tasks with high success rates and small contact forces. In\na real household setting, our method outperforms baseline methods in terms of\ntask success rates. Moreover, our method achieves smaller contact forces and\nsmaller force variances compared to baseline methods without force imitation.\nOverall, we offer a promising approach for efficient and robust mobile\nmanipulation in the real world. Videos and more details can be found on\n\\url{https://visual-force-imitation.github.io}\n","authors":["Taozheng Yang","Ya Jing","Hongtao Wu","Jiafeng Xu","Kuankuan Sima","Guangzeng Chen","Qie Sima","Tao Kong"],"pdf_url":"https://arxiv.org/pdf/2308.03624v1.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS), 2023"},{"id":"http://arxiv.org/abs/2308.03620v1","updated":"2023-08-07T14:24:52Z","published":"2023-08-07T14:24:52Z","title":"Exploring Visual Pre-training for Robot Manipulation: Datasets, Models\n and Methods","summary":" Visual pre-training with large-scale real-world data has made great progress\nin recent years, showing great potential in robot learning with pixel\nobservations. However, the recipes of visual pre-training for robot\nmanipulation tasks are yet to be built. In this paper, we thoroughly\ninvestigate the effects of visual pre-training strategies on robot manipulation\ntasks from three fundamental perspectives: pre-training datasets, model\narchitectures and training methods. Several significant experimental findings\nare provided that are beneficial for robot learning. Further, we propose a\nvisual pre-training scheme for robot manipulation termed Vi-PRoM, which\ncombines self-supervised learning and supervised learning. Concretely, the\nformer employs contrastive learning to acquire underlying patterns from\nlarge-scale unlabeled data, while the latter aims learning visual semantics and\ntemporal dynamics. Extensive experiments on robot manipulations in various\nsimulation environments and the real robot demonstrate the superiority of the\nproposed scheme. Videos and more details can be found on\n\\url{https://explore-pretrain-robot.github.io}.\n","authors":["Ya Jing","Xuelin Zhu","Xingbin Liu","Qie Sima","Taozheng Yang","Yunhai Feng","Tao Kong"],"pdf_url":"https://arxiv.org/pdf/2308.03620v1.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS), 2023"},{"id":"http://arxiv.org/abs/2308.03613v1","updated":"2023-08-07T14:16:52Z","published":"2023-08-07T14:16:52Z","title":"Adaptive Semi-Supervised Segmentation of Brain Vessels with Ambiguous\n Labels","summary":" Accurate segmentation of brain vessels is crucial for cerebrovascular disease\ndiagnosis and treatment. However, existing methods face challenges in capturing\nsmall vessels and handling datasets that are partially or ambiguously\nannotated. In this paper, we propose an adaptive semi-supervised approach to\naddress these challenges. Our approach incorporates innovative techniques\nincluding progressive semi-supervised learning, adaptative training strategy,\nand boundary enhancement. 
Experimental results on 3DRA datasets demonstrate the\nsuperiority of our method in terms of mesh-based segmentation metrics. By\nleveraging the partially and ambiguously labeled data, which only annotates the\nmain vessels, our method achieves impressive segmentation performance on\nmislabeled fine vessels, showcasing its potential for clinical applications.\n","authors":["Fengming Lin","Yan Xia","Nishant Ravikumar","Qiongyao Liu","Michael MacRaild","Alejandro F Frangi"],"pdf_url":"https://arxiv.org/pdf/2308.03613v1.pdf","comment":"Accepted by DALI MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.03610v1","updated":"2023-08-07T14:09:46Z","published":"2023-08-07T14:09:46Z","title":"AvatarVerse: High-quality & Stable 3D Avatar Creation from Text and Pose","summary":" Creating expressive, diverse and high-quality 3D avatars from highly\ncustomized text descriptions and pose guidance is a challenging task, due to\nthe intricacy of modeling and texturing in 3D that ensure details and various\nstyles (realistic, fictional, etc). We present AvatarVerse, a stable pipeline\nfor generating expressive high-quality 3D avatars from nothing but text\ndescriptions and pose guidance. In specific, we introduce a 2D diffusion model\nconditioned on DensePose signal to establish 3D pose control of avatars through\n2D images, which enhances view consistency from partially observed scenarios.\nIt addresses the infamous Janus Problem and significantly stablizes the\ngeneration process. Moreover, we propose a progressive high-resolution 3D\nsynthesis strategy, which obtains substantial improvement over the quality of\nthe created 3D avatars. To this end, the proposed AvatarVerse pipeline achieves\nzero-shot 3D modeling of 3D avatars that are not only more expressive, but also\nin higher quality and fidelity than previous works. Rigorous qualitative\nevaluations and user studies showcase AvatarVerse's superiority in synthesizing\nhigh-fidelity 3D avatars, leading to a new standard in high-quality and stable\n3D avatar creation. Our project page is: https://avatarverse3d.github.io\n","authors":["Huichao Zhang","Bowen Chen","Hao Yang","Liao Qu","Xu Wang","Li Chen","Chao Long","Feida Zhu","Kang Du","Min Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.03610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03608v1","updated":"2023-08-07T14:09:08Z","published":"2023-08-07T14:09:08Z","title":"Recurrent Self-Supervised Video Denoising with Denser Receptive Field","summary":" Self-supervised video denoising has seen decent progress through the use of\nblind spot networks. However, under their blind spot constraints, previous\nself-supervised video denoising methods suffer from significant information\nloss and texture destruction in either the whole reference frame or neighbor\nframes, due to their inadequate consideration of the receptive field. Moreover,\nthe limited number of available neighbor frames in previous methods leads to\nthe discarding of distant temporal information. Nonetheless, simply adopting\nexisting recurrent frameworks does not work, since they easily break the\nconstraints on the receptive field imposed by self-supervision. In this paper,\nwe propose RDRF for self-supervised video denoising, which not only fully\nexploits both the reference and neighbor frames with a denser receptive field,\nbut also better leverages the temporal information from both local and distant\nneighbor features. 
First, towards a comprehensive utilization of information\nfrom both reference and neighbor frames, RDRF realizes a denser receptive field\nby taking more neighbor pixels along the spatial and temporal dimensions.\nSecond, it features a self-supervised recurrent video denoising framework,\nwhich concurrently integrates distant and near-neighbor temporal features. This\nenables long-term bidirectional information aggregation, while mitigating error\naccumulation in the plain recurrent framework. Our method exhibits superior\nperformance on both synthetic and real video denoising datasets. Codes will be\navailable at https://github.com/Wang-XIaoDingdd/RDRF.\n","authors":["Zichun Wang","Yulun Zhang","Debing Zhang","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2308.03608v1.pdf","comment":"Accepted to ACMMM 2023"},{"id":"http://arxiv.org/abs/2303.14643v2","updated":"2023-08-07T14:08:44Z","published":"2023-03-26T06:59:23Z","title":"POAR: Towards Open Vocabulary Pedestrian Attribute Recognition","summary":" Pedestrian attribute recognition (PAR) aims to predict the attributes of a\ntarget pedestrian in a surveillance system. Existing methods address the PAR\nproblem by training a multi-label classifier with predefined attribute classes.\nHowever, it is impossible to exhaust all pedestrian attributes in the real\nworld. To tackle this problem, we develop a novel pedestrian open-attribute\nrecognition (POAR) framework. Our key idea is to formulate the POAR problem as\nan image-text search problem. We design a Transformer-based image encoder with\na masking strategy. A set of attribute tokens are introduced to focus on\nspecific pedestrian parts (e.g., head, upper body, lower body, feet, etc.) and\nencode corresponding attributes into visual embeddings. Each attribute category\nis described as a natural language sentence and encoded by the text encoder.\nThen, we compute the similarity between the visual and text embeddings of\nattributes to find the best attribute descriptions for the input images.\nDifferent from existing methods that learn a specific classifier for each\nattribute category, we model the pedestrian at a part-level and explore the\nsearching method to handle the unseen attributes. Finally, a many-to-many\ncontrastive (MTMC) loss with masked tokens is proposed to train the network\nsince a pedestrian image can comprise multiple attributes. Extensive\nexperiments have been conducted on benchmark PAR datasets with an\nopen-attribute setting. The results verified the effectiveness of the proposed\nPOAR method, which can form a strong baseline for the POAR task. Our code is\navailable at \\url{https://github.com/IvyYZ/POAR}.\n","authors":["Yue Zhang","Suchen Wang","Shichao Kan","Zhenyu Weng","Yigang Cen","Yap-peng Tan"],"pdf_url":"https://arxiv.org/pdf/2303.14643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03594v1","updated":"2023-08-07T13:52:21Z","published":"2023-08-07T13:52:21Z","title":"FeatEnHancer: Enhancing Hierarchical Features for Object Detection and\n Beyond Under Low-Light Vision","summary":" Extracting useful visual cues for the downstream tasks is especially\nchallenging under low-light vision. Prior works create enhanced representations\nby either correlating visual quality with machine perception or designing\nillumination-degrading transformation methods that require pre-training on\nsynthetic datasets. We argue that optimizing enhanced image representation\npertaining to the loss of the downstream task can result in more expressive\nrepresentations. 
Therefore, in this work, we propose a novel module,\nFeatEnHancer, that hierarchically combines multiscale features using\nmultiheaded attention guided by task-related loss function to create suitable\nrepresentations. Furthermore, our intra-scale enhancement improves the quality\nof features extracted at each scale or level, as well as combines features from\ndifferent scales in a way that reflects their relative importance for the task\nat hand. FeatEnHancer is a general-purpose plug-and-play module and can be\nincorporated into any low-light vision pipeline. We show with extensive\nexperimentation that the enhanced representation produced with FeatEnHancer\nsignificantly and consistently improves results in several low-light vision\ntasks, including dark object detection (+5.7 mAP on ExDark), face detection\n(+1.5 mAPon DARK FACE), nighttime semantic segmentation (+5.1 mIoU on ACDC ),\nand video object detection (+1.8 mAP on DarkVision), highlighting the\neffectiveness of enhancing hierarchical features under low-light vision.\n","authors":["Khurram Azeem Hashmi","Goutham Kallempudi","Didier Stricker","Muhammamd Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2308.03594v1.pdf","comment":"19 pages, 9 Figures, and 10 Tables. Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.03586v1","updated":"2023-08-07T13:44:44Z","published":"2023-08-07T13:44:44Z","title":"SoilNet: An Attention-based Spatio-temporal Deep Learning Framework for\n Soil Organic Carbon Prediction with Digital Soil Mapping in Europe","summary":" Digital soil mapping (DSM) is an advanced approach that integrates\nstatistical modeling and cutting-edge technologies, including machine learning\n(ML) methods, to accurately depict soil properties and their spatial\ndistribution. Soil organic carbon (SOC) is a crucial soil attribute providing\nvaluable insights into soil health, nutrient cycling, greenhouse gas emissions,\nand overall ecosystem productivity. This study highlights the significance of\nspatial-temporal deep learning (DL) techniques within the DSM framework. A\nnovel architecture is proposed, incorporating spatial information using a base\nconvolutional neural network (CNN) model and spatial attention mechanism, along\nwith climate temporal information using a long short-term memory (LSTM)\nnetwork, for SOC prediction across Europe. The model utilizes a comprehensive\nset of environmental features, including Landsat-8 images, topography, remote\nsensing indices, and climate time series, as input features. Results\ndemonstrate that the proposed framework outperforms conventional ML approaches\nlike random forest commonly used in DSM, yielding lower root mean square error\n(RMSE). This model is a robust tool for predicting SOC and could be applied to\nother soil properties, thereby contributing to the advancement of DSM\ntechniques and facilitating land management and decision-making processes based\non accurate information.\n","authors":["Nafiseh Kakhani","Moien Rangzan","Ali Jamali","Sara Attarchi","Seyed Kazem Alavipanah","Thomas Scholten"],"pdf_url":"https://arxiv.org/pdf/2308.03586v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.03580v1","updated":"2023-08-07T13:35:53Z","published":"2023-08-07T13:35:53Z","title":"Revealing the Underlying Patterns: Investigating Dataset Similarity,\n Performance, and Generalization","summary":" Supervised deep learning models require significant amount of labelled data\nto achieve an acceptable performance on a specific task. 
However, when tested\non unseen data, the models may not perform well. Therefore, the models need to\nbe trained with additional and varying labelled data to improve the\ngeneralization. In this work, our goal is to understand the models, their\nperformance and generalization. We establish image-image, dataset-dataset, and\nimage-dataset distances to gain insights into the model's behavior. Our\nproposed distance metric when combined with model performance can help in\nselecting an appropriate model/architecture from a pool of candidate\narchitectures. We have shown that the generalization of these models can be\nimproved by only adding a small number of unseen images (say 1, 3 or 7) into\nthe training set. Our proposed approach reduces training and annotation costs\nwhile providing an estimate of model performance on unseen data in dynamic\nenvironments.\n","authors":["Akshit Achara","Ram Krishna Pandey"],"pdf_url":"https://arxiv.org/pdf/2308.03580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.08083v4","updated":"2023-08-07T13:24:06Z","published":"2022-06-16T10:53:18Z","title":"CARLANE: A Lane Detection Benchmark for Unsupervised Domain Adaptation\n from Simulation to multiple Real-World Domains","summary":" Unsupervised Domain Adaptation demonstrates great potential to mitigate\ndomain shifts by transferring models from labeled source domains to unlabeled\ntarget domains. While Unsupervised Domain Adaptation has been applied to a wide\nvariety of complex vision tasks, only few works focus on lane detection for\nautonomous driving. This can be attributed to the lack of publicly available\ndatasets. To facilitate research in these directions, we propose CARLANE, a\n3-way sim-to-real domain adaptation benchmark for 2D lane detection. CARLANE\nencompasses the single-target datasets MoLane and TuLane and the multi-target\ndataset MuLane. These datasets are built from three different domains, which\ncover diverse scenes and contain a total of 163K unique images, 118K of which\nare annotated. In addition we evaluate and report systematic baselines,\nincluding our own method, which builds upon Prototypical Cross-domain\nSelf-supervised Learning. We find that false positive and false negative rates\nof the evaluated domain adaptation methods are high compared to those of fully\nsupervised baselines. This affirms the need for benchmarks such as CARLANE to\nfurther strengthen research in Unsupervised Domain Adaptation for lane\ndetection. CARLANE, all evaluated models and the corresponding implementations\nare publicly available at https://carlanebenchmark.github.io.\n","authors":["Julian Gebele","Bonifaz Stuhr","Johann Haselberger"],"pdf_url":"https://arxiv.org/pdf/2206.08083v4.pdf","comment":"36th Conference on Neural Information Processing Systems (NeurIPS\n 2022) Track on Datasets and Benchmarks, 22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2304.09534v2","updated":"2023-08-07T13:00:47Z","published":"2023-04-19T09:52:50Z","title":"Realistic Data Enrichment for Robust Image Segmentation in\n Histopathology","summary":" Poor performance of quantitative analysis in histopathological Whole Slide\nImages (WSI) has been a significant obstacle in clinical practice. Annotating\nlarge-scale WSIs manually is a demanding and time-consuming task, unlikely to\nyield the expected results when used for fully supervised learning systems.\nRarely observed disease patterns and large differences in object scales are\ndifficult to model through conventional patient intake. 
Prior methods either\nfall back to direct disease classification, which only requires learning a few\nfactors per image, or report on average image segmentation performance, which\nis highly biased towards majority observations. Geometric image augmentation is\ncommonly used to improve robustness for average case predictions and to enrich\nlimited datasets. So far no method provided sampling of a realistic posterior\ndistribution to improve stability, e.g. for the segmentation of imbalanced\nobjects within images. Therefore, we propose a new approach, based on diffusion\nmodels, which can enrich an imbalanced dataset with plausible examples from\nunderrepresented groups by conditioning on segmentation maps. Our method can\nsimply expand limited clinical datasets making them suitable to train machine\nlearning pipelines, and provides an interpretable and human-controllable way of\ngenerating histopathology images that are indistinguishable from real ones to\nhuman experts. We validate our findings on two datasets, one from the public\ndomain and one from a Kidney Transplant study.\n","authors":["Sarah Cechnicka","James Ball","Hadrien Reynaud","Callum Arthurs","Candice Roufosse","Bernhard Kainz"],"pdf_url":"https://arxiv.org/pdf/2304.09534v2.pdf","comment":"11 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.03529v1","updated":"2023-08-07T12:26:34Z","published":"2023-08-07T12:26:34Z","title":"Feature Decoupling-Recycling Network for Fast Interactive Segmentation","summary":" Recent interactive segmentation methods iteratively take source image, user\nguidance and previously predicted mask as the input without considering the\ninvariant nature of the source image. As a result, extracting features from the\nsource image is repeated in each interaction, resulting in substantial\ncomputational redundancy. In this work, we propose the Feature\nDecoupling-Recycling Network (FDRN), which decouples the modeling components\nbased on their intrinsic discrepancies and then recycles components for each\nuser interaction. Thus, the efficiency of the whole interactive process can be\nsignificantly improved. To be specific, we apply the Decoupling-Recycling\nstrategy from three perspectives to address three types of discrepancies,\nrespectively. First, our model decouples the learning of source image semantics\nfrom the encoding of user guidance to process two types of input domains\nseparately. Second, FDRN decouples high-level and low-level features from\nstratified semantic representations to enhance feature learning. Third, during\nthe encoding of user guidance, current user guidance is decoupled from\nhistorical guidance to highlight the effect of current user guidance. 
We\nconduct extensive experiments on 6 datasets from different domains and\nmodalities, which demonstrate the following merits of our model: 1) superior\nefficiency than other methods, particularly advantageous in challenging\nscenarios requiring long-term interactions (up to 4.25x faster), while\nachieving favorable segmentation performance; 2) strong applicability to\nvarious methods serving as a universal enhancement technique; 3) well\ncross-task generalizability, e.g., to medical image segmentation, and\nrobustness against misleading user guidance.\n","authors":["Huimin Zeng","Weinong Wang","Xin Tao","Zhiwei Xiong","Yu-Wing Tai","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.03529v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.14863v2","updated":"2023-08-07T12:13:05Z","published":"2023-07-27T13:49:27Z","title":"IML-ViT: Benchmarking Image Manipulation Localization by Vision\n Transformer","summary":" Advanced image tampering techniques are increasingly challenging the\ntrustworthiness of multimedia, leading to the development of Image Manipulation\nLocalization (IML). But what makes a good IML model? The answer lies in the way\nto capture artifacts. Exploiting artifacts requires the model to extract\nnon-semantic discrepancies between manipulated and authentic regions,\nnecessitating explicit comparisons between the two areas. With the\nself-attention mechanism, naturally, the Transformer should be a better\ncandidate to capture artifacts. However, due to limited datasets, there is\ncurrently no pure ViT-based approach for IML to serve as a benchmark, and CNNs\ndominate the entire task. Nevertheless, CNNs suffer from weak long-range and\nnon-semantic modeling. To bridge this gap, based on the fact that artifacts are\nsensitive to image resolution, amplified under multi-scale features, and\nmassive at the manipulation border, we formulate the answer to the former\nquestion as building a ViT with high-resolution capacity, multi-scale feature\nextraction capability, and manipulation edge supervision that could converge\nwith a small amount of data. We term this simple but effective ViT paradigm\nIML-ViT, which has significant potential to become a new benchmark for IML.\nExtensive experiments on five benchmark datasets verified our model outperforms\nthe state-of-the-art manipulation localization methods.Code and models are\navailable at \\url{https://github.com/SunnyHaze/IML-ViT}.\n","authors":["Xiaochen Ma","Bo Du","Zhuohang Jiang","Ahmed Y. Al Hammadi","Jizhe Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.14863v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03515v1","updated":"2023-08-07T12:11:04Z","published":"2023-08-07T12:11:04Z","title":"Keyword Spotting Simplified: A Segmentation-Free Approach using\n Character Counting and CTC re-scoring","summary":" Recent advances in segmentation-free keyword spotting treat this problem\nw.r.t. an object detection paradigm and borrow from state-of-the-art detection\nsystems to simultaneously propose a word bounding box proposal mechanism and\ncompute a corresponding representation. Contrary to the norm of such methods\nthat rely on complex and large DNN models, we propose a novel segmentation-free\nsystem that efficiently scans a document image to find rectangular areas that\ninclude the query information. The underlying model is simple and compact,\npredicting character occurrences over rectangular areas through an implicitly\nlearned scale map, trained on word-level annotated images. 
The proposed\ndocument scanning is then performed using this character counting in a\ncost-effective manner via integral images and binary search. Finally, the\nretrieval similarity by character counting is refined by a pyramidal\nrepresentation and a CTC-based re-scoring algorithm, fully utilizing the\ntrained CNN model. Experimental validation on two widely-used datasets shows\nthat our method achieves state-of-the-art results outperforming the more\ncomplex alternatives, despite the simplicity of the underlying model.\n","authors":["George Retsinas","Giorgos Sfikas","Christophoros Nikou"],"pdf_url":"https://arxiv.org/pdf/2308.03515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03495v1","updated":"2023-08-07T11:42:50Z","published":"2023-08-07T11:42:50Z","title":"Balanced Face Dataset: Guiding StyleGAN to Generate Labeled Synthetic\n Face Image Dataset for Underrepresented Group","summary":" For a machine learning model to generalize effectively to unseen data within\na particular problem domain, it is well-understood that the data needs to be of\nsufficient size and representative of real-world scenarios. Nonetheless,\nreal-world datasets frequently have overrepresented and underrepresented\ngroups. One solution to mitigate bias in machine learning is to leverage a\ndiverse and representative dataset. Training a model on a dataset that covers\nall demographics is crucial to reducing bias in machine learning. However,\ncollecting and labeling large-scale datasets has been challenging, prompting\nthe use of synthetic data generation and active labeling to decrease the costs\nof manual labeling. The focus of this study was to generate a robust face image\ndataset using the StyleGAN model. In order to achieve a balanced distribution\nof the dataset among different demographic groups, a synthetic dataset was\ncreated by controlling the generation process of StyleGaN and annotated for\ndifferent downstream tasks.\n","authors":["Kidist Amde Mekonnen"],"pdf_url":"https://arxiv.org/pdf/2308.03495v1.pdf","comment":"7 pages, 7 figures,submitted to AMLD Africa 2021 conference"},{"id":"http://arxiv.org/abs/2208.11176v3","updated":"2023-08-07T11:36:16Z","published":"2022-08-23T20:04:17Z","title":"A Study on the Impact of Data Augmentation for Training Convolutional\n Neural Networks in the Presence of Noisy Labels","summary":" Label noise is common in large real-world datasets, and its presence harms\nthe training process of deep neural networks. Although several works have\nfocused on the training strategies to address this problem, there are few\nstudies that evaluate the impact of data augmentation as a design choice for\ntraining deep neural networks. In this work, we analyse the model robustness\nwhen using different data augmentations and their improvement on the training\nwith the presence of noisy labels. We evaluate state-of-the-art and classical\ndata augmentation strategies with different levels of synthetic noise for the\ndatasets MNist, CIFAR-10, CIFAR-100, and the real-world dataset Clothing1M. We\nevaluate the methods using the accuracy metric. Results show that the\nappropriate selection of data augmentation can drastically improve the model\nrobustness to label noise, increasing up to 177.84% of relative best test\naccuracy compared to the baseline with no augmentation, and an increase of up\nto 6% in absolute value with the state-of-the-art DivideMix training strategy.\n","authors":["Emeson Santana","Gustavo Carneiro","Filipe R. 
Cordeiro"],"pdf_url":"https://arxiv.org/pdf/2208.11176v3.pdf","comment":"Paper accepted at SIBGRAPI 2022"},{"id":"http://arxiv.org/abs/2308.03492v1","updated":"2023-08-07T11:34:27Z","published":"2023-08-07T11:34:27Z","title":"Learning Photometric Feature Transform for Free-form Object Scan","summary":" We propose a novel framework to automatically learn to aggregate and\ntransform photometric measurements from multiple unstructured views into\nspatially distinctive and view-invariant low-level features, which are fed to a\nmulti-view stereo method to enhance 3D reconstruction. The illumination\nconditions during acquisition and the feature transform are jointly trained on\na large amount of synthetic data. We further build a system to reconstruct the\ngeometry and anisotropic reflectance of a variety of challenging objects from\nhand-held scans. The effectiveness of the system is demonstrated with a\nlightweight prototype, consisting of a camera and an array of LEDs, as well as\nan off-the-shelf tablet. Our results are validated against reconstructions from\na professional 3D scanner and photographs, and compare favorably with\nstate-of-the-art techniques.\n","authors":["Xiang Feng","Kaizhang Kang","Fan Pei","Huakeng Ding","Jinjiang You","Ping Tan","Kun Zhou","Hongzhi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13984v4","updated":"2023-08-07T11:29:26Z","published":"2022-10-24T07:43:59Z","title":"Abductive Action Inference","summary":" Abductive reasoning aims to make the most likely inference for a given set of\nincomplete observations. In this paper, we introduce a novel research task\nknown as \"abductive action inference\" which addresses the question of which\nactions were executed by a human to reach a specific state shown in a single\nsnapshot. The research explores three key abductive inference problems: action\nset prediction, action sequence prediction, and abductive action verification.\nTo tackle these challenging tasks, we investigate various models, including\nestablished ones such as Transformers, Graph Neural Networks, CLIP, BLIP, GPT3,\nend-to-end trained Slow-Fast, Resnet50-3D, and ViT models. Furthermore, the\npaper introduces several innovative models tailored for abductive action\ninference, including a relational graph neural network, a relational bilinear\npooling model, a relational rule-based inference model, a relational GPT-3\nprompt method, and a relational Transformer model. Notably, the newly proposed\nobject-relational bilinear graph encoder-decoder (BiGED) model emerges as the\nmost effective among all methods evaluated, demonstrating good proficiency in\nhandling the intricacies of the Action Genome dataset. The contributions of\nthis research offer significant progress toward comprehending the implications\nof human actions and making highly plausible inferences concerning the outcomes\nof these actions.\n","authors":["Clement Tan","Chai Kiat Yeo","Cheston Tan","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2210.13984v4.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.03486v1","updated":"2023-08-07T11:28:36Z","published":"2023-08-07T11:28:36Z","title":"Improving Mass Detection in Mammography Images: A Study of Weakly\n Supervised Learning and Class Activation Map Methods","summary":" In recent years, weakly supervised models have aided in mass detection using\nmammography images, decreasing the need for pixel-level annotations. 
However,\nmost existing models in the literature rely on Class Activation Maps (CAM) as\nthe activation method, overlooking the potential benefits of exploring other\nactivation techniques. This work presents a study that explores and compares\ndifferent activation maps in conjunction with state-of-the-art methods for\nweakly supervised training in mammography images. Specifically, we investigate\nCAM, GradCAM, GradCAM++, XGradCAM, and LayerCAM methods within the framework of\nthe GMIC model for mass detection in mammography images. The evaluation is\nconducted on the VinDr-Mammo dataset, utilizing the metrics Accuracy, True\nPositive Rate (TPR), False Negative Rate (FNR), and False Positive Per Image\n(FPPI). Results show that using different strategies of activation maps during\ntraining and test stages leads to an improvement of the model. With this\nstrategy, we improve the results of the GMIC method, decreasing the FPPI value\nand increasing TPR.\n","authors":["Vicente Sampaio","Filipe R. Cordeiro"],"pdf_url":"https://arxiv.org/pdf/2308.03486v1.pdf","comment":"Accepted for publication at SIBGRAPI 20203"},{"id":"http://arxiv.org/abs/2307.08265v2","updated":"2023-08-07T11:21:31Z","published":"2023-07-17T06:14:19Z","title":"Extreme Image Compression using Fine-tuned VQGAN Models","summary":" Recent advances in generative compression methods have demonstrated\nremarkable progress in enhancing the perceptual quality of compressed data,\nespecially in scenarios with low bitrates. Nevertheless, their efficacy and\napplicability in achieving extreme compression ratios ($<0.1$ bpp) still remain\nconstrained. In this work, we propose a simple yet effective coding framework\nby introducing vector quantization (VQ)-based generative models into the image\ncompression domain. The main insight is that the codebook learned by the VQGAN\nmodel yields strong expressive capacity, facilitating efficient compression of\ncontinuous information in the latent space while maintaining reconstruction\nquality. Specifically, an image can be represented as VQ-indices by finding the\nnearest codeword, which can be encoded using lossless compression methods into\nbitstreams. We then propose clustering a pre-trained large-scale codebook into\nsmaller codebooks using the K-means algorithm. This enables images to be\nrepresented as diverse ranges of VQ-indices maps, resulting in variable\nbitrates and different levels of reconstruction quality. Extensive qualitative\nand quantitative experiments on various datasets demonstrate that the proposed\nframework outperforms the state-of-the-art codecs in terms of perceptual\nquality-oriented metrics and human perception under extremely low bitrates.\n","authors":["Qi Mao","Tinghan Yang","Yinuo Zhang","Shuyin Pan","Meng Wang","Shiqi Wang","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2307.08265v2.pdf","comment":"Generative Compression, Extreme Compression, VQGANs, Low Bitrate"},{"id":"http://arxiv.org/abs/2308.03476v1","updated":"2023-08-07T11:09:12Z","published":"2023-08-07T11:09:12Z","title":"Exploring the Physical World Adversarial Robustness of Vehicle Detection","summary":" Adversarial attacks can compromise the robustness of real-world detection\nmodels. However, evaluating these models under real-world conditions poses\nchallenges due to resource-intensive experiments. 
Virtual simulations offer an\nalternative, but the absence of standardized benchmarks hampers progress.\nAddressing this, we propose an innovative instant-level data generation\npipeline using the CARLA simulator. Through this pipeline, we establish the\nDiscrete and Continuous Instant-level (DCI) dataset, enabling comprehensive\nexperiments involving three detection models and three physical adversarial\nattacks. Our findings highlight diverse model performances under adversarial\nconditions. Yolo v6 demonstrates remarkable resilience, experiencing just a\nmarginal 6.59% average drop in average precision (AP). In contrast, the ASA\nattack yields a substantial 14.51% average AP reduction, twice the effect of\nother algorithms. We also note that static scenes yield higher recognition AP\nvalues, and outcomes remain relatively consistent across varying weather\nconditions. Intriguingly, our study suggests that advancements in adversarial\nattack algorithms may be approaching its ``limitation''.In summary, our work\nunderscores the significance of adversarial attacks in real-world contexts and\nintroduces the DCI dataset as a versatile benchmark. Our findings provide\nvaluable insights for enhancing the robustness of detection models and offer\nguidance for future research endeavors in the realm of adversarial attacks.\n","authors":["Wei Jiang","Tianyuan Zhang","Shuangcheng Liu","Weiyu Ji","Zichao Zhang","Gang Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03471v1","updated":"2023-08-07T10:57:20Z","published":"2023-08-07T10:57:20Z","title":"Deepfake Detection: A Comparative Analysis","summary":" This paper present a comprehensive comparative analysis of supervised and\nself-supervised models for deepfake detection. We evaluate eight supervised\ndeep learning architectures and two transformer-based models pre-trained using\nself-supervised strategies (DINO, CLIP) on four benchmarks (FakeAVCeleb,\nCelebDF-V2, DFDC, and FaceForensics++). Our analysis includes intra-dataset and\ninter-dataset evaluations, examining the best performing models, generalisation\ncapabilities, and impact of augmentations. We also investigate the trade-off\nbetween model size and performance. Our main goal is to provide insights into\nthe effectiveness of different deep learning architectures (transformers,\nCNNs), training strategies (supervised, self-supervised), and deepfake\ndetection benchmarks. These insights can help guide the development of more\naccurate and reliable deepfake detection systems, which are crucial in\nmitigating the harmful impact of deepfakes on individuals and society.\n","authors":["Sohail Ahmed Khan","Duc-Tien Dang-Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.03471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03467v1","updated":"2023-08-07T10:47:08Z","published":"2023-08-07T10:47:08Z","title":"RoadScan: A Novel and Robust Transfer Learning Framework for Autonomous\n Pothole Detection in Roads","summary":" This research paper presents a novel approach to pothole detection using Deep\nLearning and Image Processing techniques. The proposed system leverages the\nVGG16 model for feature extraction and utilizes a custom Siamese network with\ntriplet loss, referred to as RoadScan. The system aims to address the critical\nissue of potholes on roads, which pose significant risks to road users.\nAccidents due to potholes on the roads have led to numerous accidents. 
Although\nit is necessary to completely remove potholes, it is a time-consuming process.\nHence, a general road user should be able to detect potholes from a safe\ndistance in order to avoid damage. Existing methods for pothole detection\nheavily rely on object detection algorithms which tend to have a high chance of\nfailure owing to the similarity in structures and textures of a road and a\npothole. Additionally, these systems utilize millions of parameters thereby\nmaking the model difficult to use in small-scale applications for the general\ncitizen. By analyzing diverse image processing methods and various\nhigh-performing networks, the proposed model achieves remarkable performance in\naccurately detecting potholes. Evaluation metrics such as accuracy, EER,\nprecision, recall, and AUROC validate the effectiveness of the system.\nAdditionally, the proposed model demonstrates computational efficiency and\ncost-effectiveness by utilizing fewer parameters and data for training. The\nresearch highlights the importance of technology in the transportation sector\nand its potential to enhance road safety and convenience. The network proposed\nin this model performs with a 96.12 % accuracy, 3.89 % EER, and a 0.988 AUROC\nvalue, which is highly competitive with other state-of-the-art works.\n","authors":["Guruprasad Parasnis","Anmol Chokshi","Kailas Devadkar"],"pdf_url":"https://arxiv.org/pdf/2308.03467v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.15063v2","updated":"2023-08-07T10:43:33Z","published":"2023-07-27T17:59:59Z","title":"To Adapt or Not to Adapt? Real-Time Adaptation for Semantic Segmentation","summary":" The goal of Online Domain Adaptation for semantic segmentation is to handle\nunforeseeable domain changes that occur during deployment, like sudden weather\nevents. However, the high computational costs associated with brute-force\nadaptation make this paradigm unfeasible for real-world applications. In this\npaper we propose HAMLET, a Hardware-Aware Modular Least Expensive Training\nframework for real-time domain adaptation. Our approach includes a\nhardware-aware back-propagation orchestration agent (HAMT) and a dedicated\ndomain-shift detector that enables active control over when and how the model\nis adapted (LT). Thanks to these advancements, our approach is capable of\nperforming semantic segmentation while simultaneously adapting at more than\n29FPS on a single consumer-grade GPU. Our framework's encouraging accuracy and\nspeed trade-off is demonstrated on OnDA and SHIFT benchmarks through\nexperimental results.\n","authors":["Marc Botet Colomer","Pier Luigi Dovesi","Theodoros Panagiotakopoulos","Joao Frederico Carvalho","Linus Härenstam-Nielsen","Hossein Azizpour","Hedvig Kjellström","Daniel Cremers","Matteo Poggi"],"pdf_url":"https://arxiv.org/pdf/2307.15063v2.pdf","comment":"ICCV 2023. The first two authors contributed equally. Project page:\n https://marcbotet.github.io/hamlet-web/"},{"id":"http://arxiv.org/abs/2308.03463v1","updated":"2023-08-07T10:41:52Z","published":"2023-08-07T10:41:52Z","title":"DiffSynth: Latent In-Iteration Deflickering for Realistic Video\n Synthesis","summary":" In recent years, diffusion models have emerged as the most powerful approach\nin image synthesis. 
However, applying these models directly to video synthesis\npresents challenges, as it often leads to noticeable flickering contents.\nAlthough recently proposed zero-shot methods can alleviate flicker to some\nextent, we still struggle to generate coherent videos. In this paper, we\npropose DiffSynth, a novel approach that aims to convert image synthesis\npipelines to video synthesis pipelines. DiffSynth consists of two key\ncomponents: a latent in-iteration deflickering framework and a video\ndeflickering algorithm. The latent in-iteration deflickering framework applies\nvideo deflickering to the latent space of diffusion models, effectively\npreventing flicker accumulation in intermediate steps. Additionally, we propose\na video deflickering algorithm, named patch blending algorithm, that remaps\nobjects in different frames and blends them together to enhance video\nconsistency. One of the notable advantages of DiffSynth is its general\napplicability to various video synthesis tasks, including text-guided video\nstylization, fashion video synthesis, image-guided video stylization, video\nrestoring, and 3D rendering. In the task of text-guided video stylization, we\nmake it possible to synthesize high-quality videos without cherry-picking. The\nexperimental results demonstrate the effectiveness of DiffSynth. All videos can\nbe viewed on our project page. Source codes will also be released.\n","authors":["Zhongjie Duan","Lizhou You","Chengyu Wang","Cen Chen","Ziheng Wu","Weining Qian","Jun Huang","Fei Chao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.03463v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2210.15808v2","updated":"2023-08-07T10:33:34Z","published":"2022-10-28T00:03:43Z","title":"Hyper-Connected Transformer Network for Multi-Modality PET-CT\n Segmentation","summary":" [18F]-Fluorodeoxyglucose (FDG) positron emission tomography - computed\ntomography (PET-CT) has become the imaging modality of choice for diagnosing\nmany cancers. Co-learning complementary PET-CT imaging features is a\nfundamental requirement for automatic tumor segmentation and for developing\ncomputer aided cancer diagnosis systems. In this study, we propose a\nhyper-connected transformer (HCT) network that integrates a transformer network\n(TN) with a hyper connected fusion for multi-modality PET-CT images. The TN was\nleveraged for its ability to provide global dependencies in image feature\nlearning, which was achieved by using image patch embeddings with a\nself-attention mechanism to capture image-wide contextual information. We\nextended the single-modality definition of TN with multiple TN based branches\nto separately extract image features. We also introduced a hyper connected\nfusion to fuse the contextual and complementary image features across multiple\ntransformers in an iterative manner. 
Our results with two clinical datasets\nshow that HCT achieved better performance in segmentation accuracy when\ncompared to the existing methods.\n","authors":["Lei Bi","Michael Fulham","Shaoli Song","David Dagan Feng","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2210.15808v2.pdf","comment":"EMBC 2023"},{"id":"http://arxiv.org/abs/2308.03457v1","updated":"2023-08-07T10:25:54Z","published":"2023-08-07T10:25:54Z","title":"Cross-Silo Prototypical Calibration for Federated Learning with Non-IID\n Data","summary":" Federated Learning aims to learn a global model on the server side that\ngeneralizes to all clients in a privacy-preserving manner, by leveraging the\nlocal models from different clients. Existing solutions focus on either\nregularizing the objective functions among clients or improving the aggregation\nmechanism for the improved model generalization capability. However, their\nperformance is typically limited by the dataset biases, such as the\nheterogeneous data distributions and the missing classes. To address this\nissue, this paper presents a cross-silo prototypical calibration method\n(FedCSPC), which takes additional prototype information from the clients to\nlearn a unified feature space on the server side. Specifically, FedCSPC first\nemploys the Data Prototypical Modeling (DPM) module to learn data patterns via\nclustering to aid calibration. Subsequently, the cross-silo prototypical\ncalibration (CSPC) module develops an augmented contrastive learning method to\nimprove the robustness of the calibration, which can effectively project\ncross-source features into a consistent space while maintaining clear decision\nboundaries. Moreover, the CSPC module's ease of implementation and\nplug-and-play characteristics make it even more remarkable. Experiments were\nconducted on four datasets in terms of performance comparison, ablation study,\nin-depth analysis and case study, and the results verified that FedCSPC is\ncapable of learning the consistent features across different data sources of\nthe same class under the guidance of calibrated model, which leads to better\nperformance than the state-of-the-art methods. The source codes have been\nreleased at https://github.com/qizhuang-qz/FedCSPC.\n","authors":["Zhuang Qi","Lei Meng","Zitan Chen","Han Hu","Hui Lin","Xiangxu Meng"],"pdf_url":"https://arxiv.org/pdf/2308.03457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12043v2","updated":"2023-08-07T10:20:59Z","published":"2023-04-24T12:38:09Z","title":"MixPro: Data Augmentation with MaskMix and Progressive Attention\n Labeling for Vision Transformer","summary":" The recently proposed data augmentation TransMix employs attention labels to\nhelp visual transformers (ViT) achieve better robustness and performance.\nHowever, TransMix is deficient in two aspects: 1) The image cropping method of\nTransMix may not be suitable for ViTs. 2) At the early stage of training, the\nmodel produces unreliable attention maps. TransMix uses unreliable attention\nmaps to compute mixed attention labels that can affect the model. To address\nthe aforementioned issues, we propose MaskMix and Progressive Attention\nLabeling (PAL) in image and label space, respectively. In detail, from the\nperspective of image space, we design MaskMix, which mixes two images based on\na patch-like grid mask. In particular, the size of each mask patch is\nadjustable and is a multiple of the image patch size, which ensures each image\npatch comes from only one image and contains more global contents. 
From the\nperspective of label space, we design PAL, which utilizes a progressive factor\nto dynamically re-weight the attention weights of the mixed attention label.\nFinally, we combine MaskMix and Progressive Attention Labeling as our new data\naugmentation method, named MixPro. The experimental results show that our\nmethod can improve various ViT-based models at scales on ImageNet\nclassification (73.8\\% top-1 accuracy based on DeiT-T for 300 epochs). After\nbeing pre-trained with MixPro on ImageNet, the ViT-based models also\ndemonstrate better transferability to semantic segmentation, object detection,\nand instance segmentation. Furthermore, compared to TransMix, MixPro also shows\nstronger robustness on several benchmarks. The code is available at\nhttps://github.com/fistyee/MixPro.\n","authors":["Qihao Zhao","Yangyu Huang","Wei Hu","Fan Zhang","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2304.12043v2.pdf","comment":"ICLR 2023, 16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.07176v2","updated":"2023-08-07T10:09:21Z","published":"2023-05-11T23:12:13Z","title":"Automatic Radiology Report Generation by Learning with Increasingly Hard\n Negatives","summary":" Automatic radiology report generation is challenging as medical images or\nreports are usually similar to each other due to the common content of anatomy.\nThis makes a model hard to capture the uniqueness of individual images and is\nprone to producing undesired generic or mismatched reports. This situation\ncalls for learning more discriminative features that could capture even\nfine-grained mismatches between images and reports. To achieve this, this paper\nproposes a novel framework to learn discriminative image and report features by\ndistinguishing them from their closest peers, i.e., hard negatives. Especially,\nto attain more discriminative features, we gradually raise the difficulty of\nsuch a learning task by creating increasingly hard negative reports for each\nimage in the feature space during training, respectively. By treating the\nincreasingly hard negatives as auxiliary variables, we formulate this process\nas a min-max alternating optimisation problem. At each iteration, conditioned\non a given set of hard negative reports, image and report features are learned\nas usual by minimising the loss functions related to report generation. After\nthat, a new set of harder negative reports will be created by maximising a loss\nreflecting image-report alignment. By solving this optimisation, we attain a\nmodel that can generate more specific and accurate reports. It is noteworthy\nthat our framework enhances discriminative feature learning without introducing\nextra network weights. Also, in contrast to the existing way of generating hard\nnegatives, our framework extends beyond the granularity of the dataset by\ngenerating harder samples out of the training set. 
Experimental study on\nbenchmark datasets verifies the efficacy of our framework and shows that it can\nserve as a plug-in to readily improve existing medical report generation\nmodels.\n","authors":["Bhanu Prakash Voutharoja","Lei Wang","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.07176v2.pdf","comment":"Accepted to European Conference on Artificial Intelligence (ECAI)\n 2023"},{"id":"http://arxiv.org/abs/2308.03448v1","updated":"2023-08-07T10:09:11Z","published":"2023-08-07T10:09:11Z","title":"Lighting Every Darkness in Two Pairs: A Calibration-Free Pipeline for\n RAW Denoising","summary":" Calibration-based methods have dominated RAW image denoising under extremely\nlow-light environments. However, these methods suffer from several main\ndeficiencies: 1) the calibration procedure is laborious and time-consuming, 2)\ndenoisers for different cameras are difficult to transfer, and 3) the\ndiscrepancy between synthetic noise and real noise is enlarged by high digital\ngain. To overcome the above shortcomings, we propose a calibration-free\npipeline for Lighting Every Drakness (LED), regardless of the digital gain or\ncamera sensor. Instead of calibrating the noise parameters and training\nrepeatedly, our method could adapt to a target camera only with few-shot paired\ndata and fine-tuning. In addition, well-designed structural modification during\nboth stages alleviates the domain gap between synthetic and real noise without\nany extra computational cost. With 2 pairs for each additional digital gain (in\ntotal 6 pairs) and 0.5% iterations, our method achieves superior performance\nover other calibration-based methods. Our code is available at\nhttps://github.com/Srameo/LED .\n","authors":["Xin Jin","Jia-Wen Xiao","Ling-Hao Han","Chunle Guo","Ruixun Zhang","Xialei Liu","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.03448v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2306.09780v2","updated":"2023-08-07T09:25:55Z","published":"2023-06-16T11:33:47Z","title":"Understanding Deep Generative Models with Generalized Empirical\n Likelihoods","summary":" Understanding how well a deep generative model captures a distribution of\nhigh-dimensional data remains an important open challenge. It is especially\ndifficult for certain model classes, such as Generative Adversarial Networks\nand Diffusion Models, whose models do not admit exact likelihoods. In this\nwork, we demonstrate that generalized empirical likelihood (GEL) methods offer\na family of diagnostic tools that can identify many deficiencies of deep\ngenerative models (DGMs). We show, with appropriate specification of moment\nconditions, that the proposed method can identify which modes have been\ndropped, the degree to which DGMs are mode imbalanced, and whether DGMs\nsufficiently capture intra-class diversity. We show how to combine techniques\nfrom Maximum Mean Discrepancy and Generalized Empirical Likelihood to create\nnot only distribution tests that retain per-sample interpretability, but also\nmetrics that include label information. We find that such tests predict the\ndegree of mode dropping and mode imbalance up to 60% better than metrics such\nas improved precision/recall. 
We provide an implementation at\nhttps://github.com/deepmind/understanding_deep_generative_models_with_generalized_empirical_likelihood/.\n","authors":["Suman Ravuri","Mélanie Rey","Shakir Mohamed","Marc Deisenroth"],"pdf_url":"https://arxiv.org/pdf/2306.09780v2.pdf","comment":"Computer Vision and Pattern Recognition 2023 (Highlight, top 2.6% of\n submissions)"},{"id":"http://arxiv.org/abs/2308.03413v1","updated":"2023-08-07T09:03:35Z","published":"2023-08-07T09:03:35Z","title":"GaFET: Learning Geometry-aware Facial Expression Translation from\n In-The-Wild Images","summary":" While current face animation methods can manipulate expressions individually,\nthey suffer from several limitations. The expressions manipulated by some\nmotion-based facial reenactment models are crude. Other ideas modeled with\nfacial action units cannot generalize to arbitrary expressions not covered by\nannotations. In this paper, we introduce a novel Geometry-aware Facial\nExpression Translation (GaFET) framework, which is based on parametric 3D\nfacial representations and can stably decoupled expression. Among them, a\nMulti-level Feature Aligned Transformer is proposed to complement non-geometric\nfacial detail features while addressing the alignment challenge of spatial\nfeatures. Further, we design a De-expression model based on StyleGAN, in order\nto reduce the learning difficulty of GaFET in unpaired \"in-the-wild\" images.\nExtensive qualitative and quantitative experiments demonstrate that we achieve\nhigher-quality and more accurate facial expression transfer results compared to\nstate-of-the-art methods, and demonstrate applicability of various poses and\ncomplex textures. Besides, videos or annotated training data are omitted,\nmaking our method easier to use and generalize.\n","authors":["Tianxiang Ma","Bingchuan Li","Qian He","Jing Dong","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2308.03413v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03411v1","updated":"2023-08-07T09:02:26Z","published":"2023-08-07T09:02:26Z","title":"A Horse with no Labels: Self-Supervised Horse Pose Estimation from\n Unlabelled Images and Synthetic Prior","summary":" Obtaining labelled data to train deep learning methods for estimating animal\npose is challenging. Recently, synthetic data has been widely used for pose\nestimation tasks, but most methods still rely on supervised learning paradigms\nutilising synthetic images and labels. Can training be fully unsupervised? Is a\ntiny synthetic dataset sufficient? What are the minimum assumptions that we\ncould make for estimating animal pose? Our proposal addresses these questions\nthrough a simple yet effective self-supervised method that only assumes the\navailability of unlabelled images and a small set of synthetic 2D poses. We\ncompletely remove the need for any 3D or 2D pose annotations (or complex 3D\nanimal models), and surprisingly our approach can still learn accurate 3D and\n2D poses simultaneously. We train our method with unlabelled images of horses\nmainly collected for YouTube videos and a prior consisting of 2D synthetic\nposes. The latter is three times smaller than the number of images needed for\ntraining. We test our method on a challenging set of horse images and evaluate\nthe predicted 3D and 2D poses. We demonstrate that it is possible to learn\naccurate animal poses even with as few assumptions as unlabelled images and a\nsmall set of 2D poses generated from synthetic data. 
Given the minimum\nrequirements and the abundance of unlabelled data, our method could be easily\ndeployed to different animals.\n","authors":["Jose Sosa","David Hogg"],"pdf_url":"https://arxiv.org/pdf/2308.03411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03409v1","updated":"2023-08-07T08:55:48Z","published":"2023-08-07T08:55:48Z","title":"DiT: Efficient Vision Transformers with Dynamic Token Routing","summary":" Recently, the tokens of images share the same static data flow in many dense\nnetworks. However, challenges arise from the variance among the objects in\nimages, such as large variations in the spatial scale and difficulties of\nrecognition for visual entities. In this paper, we propose a data-dependent\ntoken routing strategy to elaborate the routing paths of image tokens for\nDynamic Vision Transformer, dubbed DiT. The proposed framework generates a\ndata-dependent path per token, adapting to the object scales and visual\ndiscrimination of tokens. In feed-forward, the differentiable routing gates are\ndesigned to select the scaling paths and feature transformation paths for image\ntokens, leading to multi-path feature propagation. In this way, the impact of\nobject scales and visual discrimination of image representation can be\ncarefully tuned. Moreover, the computational cost can be further reduced by\ngiving budget constraints to the routing gate and early-stopping of feature\nextraction. In experiments, our DiT achieves superior performance and favorable\ncomplexity/accuracy trade-offs than many SoTA methods on ImageNet\nclassification, object detection, instance segmentation, and semantic\nsegmentation. Particularly, the DiT-B5 obtains 84.8\\% top-1 Acc on ImageNet\nwith 10.3 GFLOPs, which is 1.0\\% higher than that of the SoTA method with\nsimilar computational complexity. These extensive results demonstrate that DiT\ncan serve as versatile backbones for various vision tasks.\n","authors":["Yuchen Ma","Zhengcong Fei","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03407v1","updated":"2023-08-07T08:48:46Z","published":"2023-08-07T08:48:46Z","title":"Spatially Varying Nanophotonic Neural Networks","summary":" The explosive growth of computation and energy cost of artificial\nintelligence has spurred strong interests in new computing modalities as\npotential alternatives to conventional electronic processors. Photonic\nprocessors that execute operations using photons instead of electrons, have\npromised to enable optical neural networks with ultra-low latency and power\nconsumption. However, existing optical neural networks, limited by the\nunderlying network designs, have achieved image recognition accuracy much lower\nthan state-of-the-art electronic neural networks. In this work, we close this\ngap by introducing a large-kernel spatially-varying convolutional neural\nnetwork learned via low-dimensional reparameterization techniques. We\nexperimentally instantiate the network with a flat meta-optical system that\nencompasses an array of nanophotonic structures designed to induce\nangle-dependent responses. 
Combined with an extremely lightweight electronic\nbackend with approximately 2K parameters we demonstrate a nanophotonic neural\nnetwork reaches 73.80\\% blind test classification accuracy on CIFAR-10 dataset,\nand, as such, the first time, an optical neural network outperforms the first\nmodern digital neural network -- AlexNet (72.64\\%) with 57M parameters,\nbringing optical neural network into modern deep learning era.\n","authors":["Kaixuan Wei","Xiao Li","Johannes Froech","Praneeth Chakravarthula","James Whitehead","Ethan Tseng","Arka Majumdar","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2308.03407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08283v3","updated":"2023-08-07T08:32:54Z","published":"2022-12-16T05:10:09Z","title":"SceneGATE: Scene-Graph based co-Attention networks for TExt visual\n question answering","summary":" Most TextVQA approaches focus on the integration of objects, scene texts and\nquestion words by a simple transformer encoder. But this fails to capture the\nsemantic relations between different modalities. The paper proposes a Scene\nGraph based co-Attention Network (SceneGATE) for TextVQA, which reveals the\nsemantic relations among the objects, Optical Character Recognition (OCR)\ntokens and the question words. It is achieved by a TextVQA-based scene graph\nthat discovers the underlying semantics of an image. We created a\nguided-attention module to capture the intra-modal interplay between the\nlanguage and the vision as a guidance for inter-modal interactions. To make\nexplicit teaching of the relations between the two modalities, we proposed and\nintegrated two attention modules, namely a scene graph-based semantic\nrelation-aware attention and a positional relation-aware attention. We\nconducted extensive experiments on two benchmark datasets, Text-VQA and ST-VQA.\nIt is shown that our SceneGATE method outperformed existing ones because of the\nscene graph and its attention modules.\n","authors":["Feiqi Cao","Siwen Luo","Felipe Nunez","Zean Wen","Josiah Poon","Caren Han"],"pdf_url":"https://arxiv.org/pdf/2212.08283v3.pdf","comment":"Published in Robotics (Q1, SCI indexed Journal):\n https://www.mdpi.com/2218-6581/12/4/114"},{"id":"http://arxiv.org/abs/2307.13294v2","updated":"2023-08-07T08:12:57Z","published":"2023-07-25T07:20:21Z","title":"Imperceptible Physical Attack against Face Recognition Systems via LED\n Illumination Modulation","summary":" Although face recognition starts to play an important role in our daily life,\nwe need to pay attention that data-driven face recognition vision systems are\nvulnerable to adversarial attacks. However, the current two categories of\nadversarial attacks, namely digital attacks and physical attacks both have\ndrawbacks, with the former ones impractical and the latter one conspicuous,\nhigh-computational and inexecutable. To address the issues, we propose a\npractical, executable, inconspicuous and low computational adversarial attack\nbased on LED illumination modulation. To fool the systems, the proposed attack\ngenerates imperceptible luminance changes to human eyes through fast intensity\nmodulation of scene LED illumination and uses the rolling shutter effect of\nCMOS image sensors in face recognition systems to implant luminance information\nperturbation to the captured face images. In summary,we present a\ndenial-of-service (DoS) attack for face detection and a dodging attack for face\nverification. 
We also evaluate their effectiveness against well-known face\ndetection models, Dlib, MTCNN and RetinaFace , and face verification models,\nDlib, FaceNet,and ArcFace.The extensive experiments show that the success rates\nof DoS attacks against face detection models reach 97.67%, 100%, and 100%,\nrespectively, and the success rates of dodging attacks against all face\nverification models reach 100%.\n","authors":["Junbin Fang","Canjian Jiang","You Jiang","Puxi Lin","Zhaojie Chen","Yujing Sun","Siu-Ming Yiu","Zoe L. Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.13294v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09036v2","updated":"2023-08-07T08:10:55Z","published":"2023-03-16T02:18:41Z","title":"Mimic3D: Thriving 3D-Aware GANs via 3D-to-2D Imitation","summary":" Generating images with both photorealism and multiview 3D consistency is\ncrucial for 3D-aware GANs, yet existing methods struggle to achieve them\nsimultaneously. Improving the photorealism via CNN-based 2D super-resolution\ncan break the strict 3D consistency, while keeping the 3D consistency by\nlearning high-resolution 3D representations for direct rendering often\ncompromises image quality. In this paper, we propose a novel learning strategy,\nnamely 3D-to-2D imitation, which enables a 3D-aware GAN to generate\nhigh-quality images while maintaining their strict 3D consistency, by letting\nthe images synthesized by the generator's 3D rendering branch to mimic those\ngenerated by its 2D super-resolution branch. We also introduce 3D-aware\nconvolutions into the generator for better 3D representation learning, which\nfurther improves the image generation quality. With the above strategies, our\nmethod reaches FID scores of 5.4 and 4.3 on FFHQ and AFHQ-v2 Cats,\nrespectively, at 512x512 resolution, largely outperforming existing 3D-aware\nGANs using direct 3D rendering and coming very close to the previous\nstate-of-the-art method that leverages 2D super-resolution. Project website:\nhttps://seanchenxy.github.io/Mimic3DWeb.\n","authors":["Xingyu Chen","Yu Deng","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2303.09036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03382v1","updated":"2023-08-07T08:03:20Z","published":"2023-08-07T08:03:20Z","title":"Enhancing Nucleus Segmentation with HARU-Net: A Hybrid Attention Based\n Residual U-Blocks Network","summary":" Nucleus image segmentation is a crucial step in the analysis, pathological\ndiagnosis, and classification, which heavily relies on the quality of nucleus\nsegmentation. However, the complexity of issues such as variations in nucleus\nsize, blurred nucleus contours, uneven staining, cell clustering, and\noverlapping cells poses significant challenges. Current methods for nucleus\nsegmentation primarily rely on nuclear morphology or contour-based approaches.\nNuclear morphology-based methods exhibit limited generalization ability and\nstruggle to effectively predict irregular-shaped nuclei, while contour-based\nextraction methods face challenges in accurately segmenting overlapping nuclei.\nTo address the aforementioned issues, we propose a dual-branch network using\nhybrid attention based residual U-blocks for nucleus instance segmentation. The\nnetwork simultaneously predicts target information and target contours.\nAdditionally, we introduce a post-processing method that combines the target\ninformation and target contours to distinguish overlapping nuclei and generate\nan instance segmentation image. 
Within the network, we propose a context fusion\nblock (CF-block) that effectively extracts and merges contextual information\nfrom the network. Extensive quantitative evaluations are conducted to assess\nthe performance of our method. Experimental results demonstrate the superior\nperformance of the proposed method compared to state-of-the-art approaches on\nthe BNS, MoNuSeg, CoNSeg, and CPM-17 datasets.\n","authors":["Junzhou Chen","Qian Huang","Yulin Chen","Linyi Qian","Chengyuan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03382v1.pdf","comment":"Nucleus segmentation, Deep learning, Instance segmentation, Medical\n imaging, Dual-Branch network"},{"id":"http://arxiv.org/abs/2308.01661v3","updated":"2023-08-07T08:00:36Z","published":"2023-08-03T09:56:31Z","title":"BEVControl: Accurately Controlling Street-view Elements with\n Multi-perspective Consistency via BEV Sketch Layout","summary":" Using synthesized images to boost the performance of perception models is a\nlong-standing research challenge in computer vision. It becomes more eminent in\nvisual-centric autonomous driving systems with multi-view cameras as some\nlong-tail scenarios can never be collected. Guided by the BEV segmentation\nlayouts, the existing generative networks seem to synthesize photo-realistic\nstreet-view images when evaluated solely on scene-level metrics. However, once\nzoom-in, they usually fail to produce accurate foreground and background\ndetails such as heading. To this end, we propose a two-stage generative method,\ndubbed BEVControl, that can generate accurate foreground and background\ncontents. In contrast to segmentation-like input, it also supports sketch style\ninput, which is more flexible for humans to edit. In addition, we propose a\ncomprehensive multi-level evaluation protocol to fairly compare the quality of\nthe generated scene, foreground object, and background geometry. Our extensive\nexperiments show that our BEVControl surpasses the state-of-the-art method,\nBEVGen, by a significant margin, from 5.89 to 26.80 on foreground segmentation\nmIoU. In addition, we show that using images generated by BEVControl to train\nthe downstream perception model, it achieves on average 1.29 improvement in NDS\nscore.\n","authors":["Kairui Yang","Enhui Ma","Jibin Peng","Qing Guo","Di Lin","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01661v3.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.03381v1","updated":"2023-08-07T07:59:56Z","published":"2023-08-07T07:59:56Z","title":"Bilevel Generative Learning for Low-Light Vision","summary":" Recently, there has been a growing interest in constructing deep learning\nschemes for Low-Light Vision (LLV). Existing techniques primarily focus on\ndesigning task-specific and data-dependent vision models on the standard RGB\ndomain, which inherently contain latent data associations. In this study, we\npropose a generic low-light vision solution by introducing a generative block\nto convert data from the RAW to the RGB domain. This novel approach connects\ndiverse vision problems by explicitly depicting data generation, which is the\nfirst in the field. To precisely characterize the latent correspondence between\nthe generative procedure and the vision task, we establish a bilevel model with\nthe parameters of the generative block defined as the upper level and the\nparameters of the vision task defined as the lower level. 
We further develop\ntwo types of learning strategies targeting different goals, namely low cost and\nhigh accuracy, to acquire a new bilevel generative learning paradigm. The\ngenerative blocks embrace a strong generalization ability in other low-light\nvision tasks through the bilevel optimization on enhancement tasks. Extensive\nexperimental evaluations on three representative low-light vision tasks, namely\nenhancement, detection, and segmentation, fully demonstrate the superiority of\nour proposed approach. The code will be available at\nhttps://github.com/Yingchi1998/BGL.\n","authors":["Yingchi Liu","Zhu Liu","Long Ma","Jinyuan Liu","Xin Fan","Zhongxuan Luo","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03381v1.pdf","comment":"Accepted by ACM MM'2023, The code will be available at\n https://github.com/Yingchi1998/BGL"},{"id":"http://arxiv.org/abs/2308.03375v1","updated":"2023-08-07T07:54:32Z","published":"2023-08-07T07:54:32Z","title":"VR-based body tracking to stimulate musculoskeletal training","summary":" Training helps to maintain and improve sufficient muscle function, body\ncontrol, and body coordination. These are important to reduce the risk of\nfracture incidents caused by falls, especially for the elderly or people\nrecovering from injury. Virtual reality training can offer a cost-effective and\nindividualized training experience. We present an application for the HoloLens\n2 to enable musculoskeletal training for elderly and impaired persons to allow\nfor autonomous training and automatic progress evaluation. We designed a\nvirtual downhill skiing scenario that is controlled by body movement to\nstimulate balance and body control. By adapting the parameters of the ski\nslope, we can tailor the intensity of the training to individual users. In this\nwork, we evaluate whether the movement data of the HoloLens 2 alone is\nsufficient to control and predict body movement and joint angles during\nmusculoskeletal training. We record the movements of 10 healthy volunteers with\nexternal tracking cameras and track a set of body and joint angles of the\nparticipant during training. We estimate correlation coefficients and\nsystematically analyze whether whole body movement can be derived from the\nmovement data of the HoloLens 2. No participant reports movement sickness\neffects and all were able to quickly interact and control their movement during\nskiing. Our results show a high correlation between HoloLens 2 movement data\nand the external tracking of the upper body movement and joint angles of the\nlower limbs.\n","authors":["M. Neidhardt","S. Gerlach F. N. Schmidt","I. A. K. Fiedler","S. Grube","B. Busse","A. Schlaefer"],"pdf_url":"https://arxiv.org/pdf/2308.03375v1.pdf","comment":"Conference"},{"id":"http://arxiv.org/abs/2308.03374v1","updated":"2023-08-07T07:53:39Z","published":"2023-08-07T07:53:39Z","title":"Heterogeneous Forgetting Compensation for Class-Incremental Learning","summary":" Class-incremental learning (CIL) has achieved remarkable successes in\nlearning new classes consecutively while overcoming catastrophic forgetting on\nold categories. However, most existing CIL methods unreasonably assume that all\nold categories have the same forgetting pace, and neglect negative influence of\nforgetting heterogeneity among different old classes on forgetting\ncompensation. 
To surmount the above challenges, we develop a novel\nHeterogeneous Forgetting Compensation (HFC) model, which can resolve\nheterogeneous forgetting of easy-to-forget and hard-to-forget old categories\nfrom both representation and gradient aspects. Specifically, we design a\ntask-semantic aggregation block to alleviate heterogeneous forgetting from\nrepresentation aspect. It aggregates local category information within each\ntask to learn task-shared global representations. Moreover, we develop two\nnovel plug-and-play losses: a gradient-balanced forgetting compensation loss\nand a gradient-balanced relation distillation loss to alleviate forgetting from\ngradient aspect. They consider gradient-balanced compensation to rectify\nforgetting heterogeneity of old categories and heterogeneous relation\nconsistency. Experiments on several representative datasets illustrate\neffectiveness of our HFC model. The code is available at\nhttps://github.com/JiahuaDong/HFC.\n","authors":["Jiahua Dong","Wenqi Liang","Yang Cong","Gan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.03374v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2304.14104v2","updated":"2023-08-07T07:52:35Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v2.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2307.13925v3","updated":"2023-08-07T07:40:39Z","published":"2023-07-26T02:46:50Z","title":"EasyNet: An Easy Network for 3D Industrial Anomaly Detection","summary":" 3D anomaly detection is an emerging and vital computer vision task in\nindustrial manufacturing (IM). Recently many advanced algorithms have been\npublished, but most of them cannot meet the needs of IM. There are several\ndisadvantages: i) difficult to deploy on production lines since their\nalgorithms heavily rely on large pre-trained models; ii) hugely increase\nstorage overhead due to overuse of memory banks; iii) the inference speed\ncannot be achieved in real-time. 
To overcome these issues, we propose an easy\nand deployment-friendly network (called EasyNet) without using pre-trained\nmodels and memory banks: firstly, we design a multi-scale multi-modality\nfeature encoder-decoder to accurately reconstruct the segmentation maps of\nanomalous regions and encourage the interaction between RGB images and depth\nimages; secondly, we adopt a multi-modality anomaly segmentation network to\nachieve a precise anomaly map; thirdly, we propose an attention-based\ninformation entropy fusion module for feature fusion during inference, making\nit suitable for real-time deployment. Extensive experiments show that EasyNet\nachieves an anomaly detection AUROC of 92.6% without using pre-trained models\nand memory banks. In addition, EasyNet is faster than existing methods, with a\nhigh frame rate of 94.55 FPS on a Tesla V100 GPU.\n","authors":["Ruitao Chen","Guoyang Xie","Jiaqi Liu","Jinbao Wang","Ziqi Luo","Jinfan Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.13925v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03364v1","updated":"2023-08-07T07:39:39Z","published":"2023-08-07T07:39:39Z","title":"Dual Aggregation Transformer for Image Super-Resolution","summary":" Transformer has recently gained considerable popularity in low-level vision\ntasks, including image super-resolution (SR). These networks utilize\nself-attention along different dimensions, spatial or channel, and achieve\nimpressive performance. This inspires us to combine the two dimensions in\nTransformer for a more powerful representation capability. Based on the above\nidea, we propose a novel Transformer model, Dual Aggregation Transformer (DAT),\nfor image SR. Our DAT aggregates features across spatial and channel\ndimensions, in the inter-block and intra-block dual manner. Specifically, we\nalternately apply spatial and channel self-attention in consecutive Transformer\nblocks. The alternate strategy enables DAT to capture the global context and\nrealize inter-block feature aggregation. Furthermore, we propose the adaptive\ninteraction module (AIM) and the spatial-gate feed-forward network (SGFN) to\nachieve intra-block feature aggregation. AIM complements two self-attention\nmechanisms from corresponding dimensions. Meanwhile, SGFN introduces additional\nnon-linear spatial information in the feed-forward network. Extensive\nexperiments show that our DAT surpasses current methods. Code and models are\nobtainable at https://github.com/zhengchen1999/DAT.\n","authors":["Zheng Chen","Yulun Zhang","Jinjin Gu","Linghe Kong","Xiaokang Yang","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03364v1.pdf","comment":"Accepted to ICCV 2023. Code is available at\n https://github.com/zhengchen1999/DAT"},{"id":"http://arxiv.org/abs/2308.03359v1","updated":"2023-08-07T07:28:24Z","published":"2023-08-07T07:28:24Z","title":"Distortion-aware Transformer in 360° Salient Object Detection","summary":" With the emergence of VR and AR, 360{\\deg} data attracts increasing attention\nfrom the computer vision and multimedia communities. Typically, 360{\\deg} data\nis projected into 2D ERP (equirectangular projection) images for feature\nextraction. However, existing methods cannot handle the distortions that result\nfrom the projection, hindering the development of 360-data-based tasks.\nTherefore, in this paper, we propose a Transformer-based model called DATFormer\nto address the distortion problem. We tackle this issue from two perspectives.\nFirstly, we introduce two distortion-adaptive modules. 
The first is a\nDistortion Mapping Module, which guides the model to pre-adapt to distorted\nfeatures globally. The second module is a Distortion-Adaptive Attention Block\nthat reduces local distortions on multi-scale features. Secondly, to exploit\nthe unique characteristics of 360{\\deg} data, we present a learnable relation\nmatrix and use it as part of the positional embedding to further improve\nperformance. Extensive experiments are conducted on three public datasets, and\nthe results show that our model outperforms existing 2D SOD (salient object\ndetection) and 360 SOD methods.\n","authors":["Yinjie Zhao","Lichen Zhao","Qian Yu","Jing Zhang","Lu Sheng","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03359v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2304.03532v2","updated":"2023-08-07T07:25:34Z","published":"2023-04-07T08:11:16Z","title":"Graph-Guided MLP-Mixer for Skeleton-Based Human Motion Prediction","summary":" In recent years, Graph Convolutional Networks (GCNs) have been widely used in\nhuman motion prediction, but their performance remains unsatisfactory.\nRecently, MLP-Mixer, initially developed for vision tasks, has been leveraged\ninto human motion prediction as a promising alternative to GCNs, which achieves\nboth better performance and better efficiency than GCNs. Unlike GCNs, which can\nexplicitly capture human skeleton's bone-joint structure by representing it as\na graph with edges and nodes, MLP-Mixer relies on fully connected layers and\nthus cannot explicitly model such graph-like structure of human's. To break\nthis limitation of MLP-Mixer's, we propose \\textit{Graph-Guided Mixer}, a novel\napproach that equips the original MLP-Mixer architecture with the capability to\nmodel graph structure. By incorporating graph guidance, our\n\\textit{Graph-Guided Mixer} can effectively capture and utilize the specific\nconnectivity patterns within human skeleton's graph representation. In this\npaper, first we uncover a theoretical connection between MLP-Mixer and GCN that\nis unexplored in existing research. Building on this theoretical connection,\nnext we present our proposed \\textit{Graph-Guided Mixer}, explaining how the\noriginal MLP-Mixer architecture is reinvented to incorporate guidance from\ngraph structure. Then we conduct an extensive evaluation on the Human3.6M,\nAMASS, and 3DPW datasets, which shows that our method achieves state-of-the-art\nperformance.\n","authors":["Xinshun Wang","Qiongjie Cui","Chen Chen","Shen Zhao","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2304.03532v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03354v1","updated":"2023-08-07T07:23:43Z","published":"2023-08-07T07:23:43Z","title":"Energy-Guided Diffusion Model for CBCT-to-CT Synthesis","summary":" Cone Beam CT (CBCT) plays a crucial role in Adaptive Radiation Therapy (ART)\nby accurately providing radiation treatment when organ anatomy changes occur.\nHowever, CBCT images suffer from scatter noise and artifacts, making relying\nsolely on CBCT for precise dose calculation and accurate tissue localization\nchallenging. Therefore, there is a need to improve CBCT image quality and\nHounsfield Unit (HU) accuracy while preserving anatomical structures. To\nenhance the role and application value of CBCT in ART, we propose an\nenergy-guided diffusion model (EGDiff) and conduct experiments on a chest tumor\ndataset to generate synthetic CT (sCT) from CBCT. 
The experimental results\ndemonstrate impressive performance with an average absolute error of\n26.87$\\pm$6.14 HU, a structural similarity index measurement of 0.850$\\pm$0.03,\na peak signal-to-noise ratio of the sCT of 19.83$\\pm$1.39 dB, and a normalized\ncross-correlation of the sCT of 0.874$\\pm$0.04. These results indicate that our\nmethod outperforms state-of-the-art unsupervised synthesis methods in accuracy\nand visual quality, producing superior sCT images.\n","authors":["Linjie Fu","Xia Li","Xiuding Cai","Dong Miao","Yu Yao","Yali Shen"],"pdf_url":"https://arxiv.org/pdf/2308.03354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03349v1","updated":"2023-08-07T07:03:49Z","published":"2023-08-07T07:03:49Z","title":"SciGraphQA: A Large-Scale Synthetic Multi-Turn Question-Answering\n Dataset for Scientific Graphs","summary":" In this work, we present SciGraphQA, a synthetic multi-turn question-answer\ndataset related to academic graphs. SciGraphQA is 13 times larger than\nChartVQA, the previously largest chart-visual question-answering dataset. It is\nalso the largest open-sourced chart VQA dataset with non-synthetic charts. To\nbuild our dataset, we selected 290,000 Computer Science or Machine Learning\nArXiv papers published between 2010 and 2020, and then used Palm-2 to generate\n295K samples of open-vocabulary multi-turn question-answering dialogues about\nthe graphs. As context, we provided the text-only Palm-2 with paper title,\nabstract, paragraph mentioning the graph, and rich text contextual data from\nthe graph itself, obtaining dialogues with an average 2.23 question-answer\nturns for each graph. We asked GPT-4 to assess the matching quality of our\nquestion-answer turns given the paper's context, obtaining an average rating of\n8.7/10 on our 3K test set. We evaluated the 0-shot capability of the most\npopular MLLM models such as LLaVa, mPLUGowl, BLIP-2, and openFlamingo's on our\ndataset, finding LLaVA-13B being the most performant with a CIDEr score of\n0.08. We further enriched the question prompts for LLAVA by including the\nserialized data tables extracted from the graphs using the DePlot model,\nboosting LLaVA's 0-shot CIDEr to 0.15. To verify the validity of our dataset,\nwe also fine-tuned LLaVa using our dataset, reaching a substantially higher\nCIDEr score of 0.26. We anticipate further accuracy improvement by including\nsegmentation mask tokens and leveraging larger LLM backbones coupled with\nemergent prompting techniques. Our code and data are open-sourced.\n","authors":["Shengzhi Li","Nima Tajbakhsh"],"pdf_url":"https://arxiv.org/pdf/2308.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03348v1","updated":"2023-08-07T07:02:42Z","published":"2023-08-07T07:02:42Z","title":"Cooperative Colorization: Exploring Latent Cross-Domain Priors for NIR\n Image Spectrum Translation","summary":" Near-infrared (NIR) image spectrum translation is a challenging problem with\nmany promising applications. Existing methods struggle with the mapping\nambiguity between the NIR and the RGB domains, and generalize poorly due to the\nlimitations of models' learning capabilities and the unavailability of\nsufficient NIR-RGB image pairs for training. To address these challenges, we\npropose a cooperative learning paradigm that colorizes NIR images in parallel\nwith another proxy grayscale colorization task by exploring latent cross-domain\npriors (i.e., latent spectrum context priors and task domain priors), dubbed\nCoColor. 
The complementary statistical and semantic spectrum information from\nthese two task domains -- in the form of pre-trained colorization networks --\nis brought in as task domain priors. A bilateral domain translation module is\nsubsequently designed, in which intermittent NIR images are generated from\ngrayscale and colorized in parallel with authentic NIR images; and vice versa\nfor the grayscale images. These intermittent transformations act as latent\nspectrum context priors for efficient domain knowledge exchange. We\nprogressively fine-tune and fuse these modules with a series of pixel-level and\nfeature-level consistency constraints. Experiments show that our proposed\ncooperative learning framework produces satisfactory spectrum translation\noutputs with diverse colors and rich textures, and outperforms state-of-the-art\ncounterparts by 3.95dB and 4.66dB in terms of PSNR for the NIR and grayscale\ncolorization tasks, respectively.\n","authors":["Xingxing Yang","Jie Chen","Zaifeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.03348v1.pdf","comment":"Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.03340v1","updated":"2023-08-07T06:47:36Z","published":"2023-08-07T06:47:36Z","title":"A Hybrid CNN-Transformer Architecture with Frequency Domain Contrastive\n Learning for Image Deraining","summary":" Image deraining is a challenging task that involves restoring degraded images\naffected by rain streaks.\n","authors":["Cheng Wang","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2308.03340v1.pdf","comment":"21 pages,6 figures"},{"id":"http://arxiv.org/abs/2209.10510v2","updated":"2023-08-07T06:40:13Z","published":"2022-09-21T17:15:58Z","title":"Learning to Relight Portrait Images via a Virtual Light Stage and\n Synthetic-to-Real Adaptation","summary":" Given a portrait image of a person and an environment map of the target\nlighting, portrait relighting aims to re-illuminate the person in the image as\nif the person appeared in an environment with the target lighting. To achieve\nhigh-quality results, recent methods rely on deep learning. An effective\napproach is to supervise the training of deep neural networks with a\nhigh-fidelity dataset of desired input-output pairs, captured with a light\nstage. However, acquiring such data requires an expensive special capture rig\nand time-consuming efforts, limiting access to only a few resourceful\nlaboratories. To address the limitation, we propose a new approach that can\nperform on par with the state-of-the-art (SOTA) relighting methods without\nrequiring a light stage. Our approach is based on the realization that a\nsuccessful relighting of a portrait image depends on two conditions. First, the\nmethod needs to mimic the behaviors of physically-based relighting. Second, the\noutput has to be photorealistic. To meet the first condition, we propose to\ntrain the relighting network with training data generated by a virtual light\nstage that performs physically-based rendering on various 3D synthetic humans\nunder different environment maps. To meet the second condition, we develop a\nnovel synthetic-to-real approach to bring photorealism to the relighting\nnetwork output. 
In addition to achieving SOTA results, our approach offers\nseveral advantages over the prior methods, including controllable glares on\nglasses and more temporally-consistent results for relighting videos.\n","authors":["Yu-Ying Yeh","Koki Nagano","Sameh Khamis","Jan Kautz","Ming-Yu Liu","Ting-Chun Wang"],"pdf_url":"https://arxiv.org/pdf/2209.10510v2.pdf","comment":"To appear in ACM Transactions on Graphics (SIGGRAPH Asia 2022). 21\n pages, 25 figures, 7 tables. Project page:\n https://research.nvidia.com/labs/dir/lumos/"},{"id":"http://arxiv.org/abs/2304.01198v2","updated":"2023-08-07T06:24:13Z","published":"2023-04-03T17:59:21Z","title":"Open-Vocabulary Semantic Segmentation with Decoupled One-Pass Network","summary":" Recently, the open-vocabulary semantic segmentation problem has attracted\nincreasing attention and the best performing methods are based on two-stream\nnetworks: one stream for proposal mask generation and the other for segment\nclassification using a pretrained visual-language model. However, existing\ntwo-stream methods require passing a great number of (up to a hundred) image\ncrops into the visual-language model, which is highly inefficient. To address\nthe problem, we propose a network that only needs a single pass through the\nvisual-language model for each input image. Specifically, we first propose a\nnovel network adaptation approach, termed patch severance, to restrict the\nharmful interference between the patch embeddings in the pre-trained visual\nencoder. We then propose classification anchor learning to encourage the\nnetwork to spatially focus on more discriminative features for classification.\nExtensive experiments demonstrate that the proposed method achieves outstanding\nperformance, surpassing state-of-the-art methods while being 4 to 7 times\nfaster at inference. Code: https://github.com/CongHan0808/DeOP.git\n","authors":["Cong Han","Yujie Zhong","Dengjie Li","Kai Han","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2304.01198v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03322v1","updated":"2023-08-07T06:15:51Z","published":"2023-08-07T06:15:51Z","title":"Part-Aware Transformer for Generalizable Person Re-identification","summary":" Domain generalization person re-identification (DG-ReID) aims to train a\nmodel on source domains and generalize well on unseen domains. Vision\nTransformer usually yields better generalization ability than common CNN\nnetworks under distribution shifts. However, Transformer-based ReID models\ninevitably over-fit to domain-specific biases due to the supervised learning\nstrategy on the source domain. We observe that while the global images of\ndifferent IDs should have different features, their similar local parts (e.g.,\nblack backpack) are not bounded by this constraint. Motivated by this, we\npropose a pure Transformer model (termed Part-aware Transformer) for DG-ReID by\ndesigning a proxy task, named Cross-ID Similarity Learning (CSL), to mine local\nvisual information shared by different IDs. This proxy task allows the model to\nlearn generic features because it only cares about the visual similarity of the\nparts regardless of the ID labels, thus alleviating the side effect of\ndomain-specific biases. Based on the local similarity obtained in CSL, a\nPart-guided Self-Distillation (PSD) is proposed to further improve the\ngeneralization of global features. Our method achieves state-of-the-art\nperformance under most DG ReID settings. 
Under the Market$\\to$Duke setting, our\nmethod exceeds state-of-the-art by 10.9% and 12.8% in Rank1 and mAP,\nrespectively. The code is available at\nhttps://github.com/liyuke65535/Part-Aware-Transformer.\n","authors":["Hao Ni","Yuke Li","Heng Tao Shen","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2308.03322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03321v1","updated":"2023-08-07T06:08:51Z","published":"2023-08-07T06:08:51Z","title":"AFN: Adaptive Fusion Normalization via Encoder-Decoder Framework","summary":" The success of deep learning is inseparable from normalization layers.\nResearchers have proposed various normalization functions, and each of them has\nboth advantages and disadvantages. In response, efforts have been made to\ndesign a unified normalization function that combines all normalization\nprocedures and mitigates their weaknesses. We also proposed a new normalization\nfunction called Adaptive Fusion Normalization. Through experiments, we\ndemonstrate AFN outperforms the previous normalization techniques in domain\ngeneralization and image classification tasks.\n","authors":["Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03321v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2106.01899 by other authors"},{"id":"http://arxiv.org/abs/2304.01199v2","updated":"2023-08-07T05:07:20Z","published":"2023-04-03T17:59:49Z","title":"On the Benefits of 3D Pose and Tracking for Human Action Recognition","summary":" In this work we study the benefits of using tracking and 3D poses for action\nrecognition. To achieve this, we take the Lagrangian view on analysing actions\nover a trajectory of human motion rather than at a fixed point in space. Taking\nthis stand allows us to use the tracklets of people to predict their actions.\nIn this spirit, first we show the benefits of using 3D pose to infer actions,\nand study person-person interactions. Subsequently, we propose a Lagrangian\nAction Recognition model by fusing 3D pose and contextualized appearance over\ntracklets. To this end, our method achieves state-of-the-art performance on the\nAVA v2.2 dataset on both pose only settings and on standard benchmark settings.\nWhen reasoning about the action using only pose cues, our pose model achieves\n+10.0 mAP gain over the corresponding state-of-the-art while our fused model\nhas a gain of +2.8 mAP over the best state-of-the-art model. Code and results\nare available at: https://brjathu.github.io/LART\n","authors":["Jathushan Rajasegaran","Georgios Pavlakos","Angjoo Kanazawa","Christoph Feichtenhofer","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2304.01199v2.pdf","comment":"CVPR2023 (project page: https://brjathu.github.io/LART)"},{"id":"http://arxiv.org/abs/2106.14490v3","updated":"2023-08-07T04:47:05Z","published":"2021-06-28T09:09:14Z","title":"Making Images Real Again: A Comprehensive Survey on Deep Image\n Composition","summary":" As a common image editing operation, image composition aims to combine the\nforeground from one image and another background image, resulting in a\ncomposite image. However, there are many issues that could make the composite\nimages unrealistic. These issues can be summarized as the inconsistency between\nforeground and background, which includes appearance inconsistency (e.g.,\nincompatible illumination), geometry inconsistency (e.g., unreasonable size),\nand semantic inconsistency (e.g., mismatched semantic context). 
The image\ncomposition task can be decomposed into multiple sub-tasks, in which each\nsub-task targets one or more issues. Specifically, object placement aims to\nfind reasonable scale, location, and shape for the foreground. Image blending\naims to address the unnatural boundary between foreground and background. Image\nharmonization aims to adjust the illumination statistics of foreground. Shadow\ngeneration aims to generate plausible shadow for the foreground. These\nsub-tasks can be executed sequentially or in parallel to acquire realistic\ncomposite images. To the best of our knowledge, there is no previous survey on\nimage composition. In this paper, we conduct a comprehensive survey of the\nsub-tasks and the combinatorial task of image composition. For each one, we\nsummarize the existing methods, available datasets, and common evaluation\nmetrics. Datasets and codes for image composition are summarized at\nhttps://github.com/bcmi/Awesome-Image-Composition.\n","authors":["Li Niu","Wenyan Cong","Liu Liu","Yan Hong","Bo Zhang","Jing Liang","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2106.14490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16415v2","updated":"2023-08-07T04:29:12Z","published":"2023-07-31T05:48:39Z","title":"DDG-Net: Discriminability-Driven Graph Network for Weakly-supervised\n Temporal Action Localization","summary":" Weakly-supervised temporal action localization (WTAL) is a practical yet\nchallenging task. Due to large-scale datasets, most existing methods use a\nnetwork pretrained on other datasets to extract features, which are not\nsuitable enough for WTAL. To address this problem, researchers design several\nmodules for feature enhancement, which improve the performance of the\nlocalization module, especially modeling the temporal relationship between\nsnippets. However, all of them neglect the adverse effects of ambiguous\ninformation, which would reduce the discriminability of others. Considering\nthis phenomenon, we propose Discriminability-Driven Graph Network (DDG-Net),\nwhich explicitly models ambiguous snippets and discriminative snippets with\nwell-designed connections, preventing the transmission of ambiguous information\nand enhancing the discriminability of snippet-level representations.\nAdditionally, we propose a feature consistency loss to prevent the assimilation\nof features and drive the graph convolution network to generate more\ndiscriminative representations. Extensive experiments on THUMOS14 and\nActivityNet1.2 benchmarks demonstrate the effectiveness of DDG-Net,\nestablishing new state-of-the-art results on both datasets. Source code is\navailable at \url{https://github.com/XiaojunTang22/ICCV2023-DDGNet}.\n","authors":["Xiaojun Tang","Junsong Fan","Chuanchen Luo","Zhaoxiang Zhang","Man Zhang","Zongyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2307.16415v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03290v1","updated":"2023-08-07T04:17:19Z","published":"2023-08-07T04:17:19Z","title":"FLIQS: One-Shot Mixed-Precision Floating-Point and Integer Quantization\n Search","summary":" Quantization has become a mainstream compression technique for reducing model\nsize, computational requirements, and energy consumption for modern deep neural\nnetworks (DNNs). With the improved numerical support in recent hardware,\nincluding multiple variants of integer and floating point, mixed-precision\nquantization has become necessary to achieve high-quality results with low\nmodel cost. 
Prior mixed-precision quantization methods have performed a\npost-training quantization search, which compromises on accuracy, or a\ndifferentiable quantization search, which leads to high memory usage from\nbranching. Therefore, we propose the first one-shot mixed-precision\nquantization search that eliminates the need for retraining in both integer and\nlow-precision floating point models. We evaluate our floating-point and integer\nquantization search (FLIQS) on multiple convolutional networks and vision\ntransformer models to discover Pareto-optimal models. Our approach discovers\nmodels that improve upon uniform precision, manual mixed-precision, and recent\ninteger quantization search methods. With the proposed integer quantization\nsearch, we increase the accuracy of ResNet-18 on ImageNet by 1.31% points and\nResNet-50 by 0.90% points with equivalent model cost over previous methods.\nAdditionally, for the first time, we explore a novel mixed-precision\nfloating-point search and improve MobileNetV2 by up to 0.98% points compared to\nprior state-of-the-art FP8 models. Finally, we extend FLIQS to simultaneously\nsearch a joint quantization and neural architecture space and improve the\nImageNet accuracy by 2.69% points with similar model cost on a MobileNetV2\nsearch space.\n","authors":["Jordan Dotzel","Gang Wu","Andrew Li","Muhammad Umar","Yun Ni","Mohamed S. Abdelfattah","Zhiru Zhang","Liqun Cheng","Martin G. Dixon","Norman P. Jouppi","Quoc V. Le","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.03290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03286v1","updated":"2023-08-07T04:04:22Z","published":"2023-08-07T04:04:22Z","title":"Multi-Label Self-Supervised Learning with Scene Images","summary":" Self-supervised learning (SSL) methods targeting scene images have seen a\nrapid growth recently, and they mostly rely on either a dedicated dense\nmatching mechanism or a costly unsupervised object discovery module. This paper\nshows that instead of hinging on these strenuous operations, quality image\nrepresentations can be learned by treating scene/multi-label image SSL simply\nas a multi-label classification problem, which greatly simplifies the learning\nframework. Specifically, multiple binary pseudo-labels are assigned for each\ninput image by comparing its embeddings with those in two dictionaries, and the\nnetwork is optimized using the binary cross entropy loss. The proposed method\nis named Multi-Label Self-supervised learning (MLS). Visualizations\nqualitatively show that clearly the pseudo-labels by MLS can automatically find\nsemantically similar pseudo-positive pairs across different images to\nfacilitate contrastive learning. MLS learns high quality representations on\nMS-COCO and achieves state-of-the-art results on classification, detection and\nsegmentation benchmarks. 
At the same time, MLS is much simpler than existing\nmethods, making it easier to deploy and for further exploration.\n","authors":["Ke Zhu","Minghao Fu","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03282v1","updated":"2023-08-07T03:56:15Z","published":"2023-08-07T03:56:15Z","title":"Environment-Invariant Curriculum Relation Learning for Fine-Grained\n Scene Graph Generation","summary":" The scene graph generation (SGG) task is designed to identify the predicates\nbased on the subject-object pairs.However,existing datasets generally include\ntwo imbalance cases: one is the class imbalance from the predicted predicates\nand another is the context imbalance from the given subject-object pairs, which\npresents significant challenges for SGG. Most existing methods focus on the\nimbalance of the predicted predicate while ignoring the imbalance of the\nsubject-object pairs, which could not achieve satisfactory results. To address\nthe two imbalance cases, we propose a novel Environment Invariant Curriculum\nRelation learning (EICR) method, which can be applied in a plug-and-play\nfashion to existing SGG methods. Concretely, to remove the imbalance of the\nsubject-object pairs, we first construct different distribution environments\nfor the subject-object pairs and learn a model invariant to the environment\nchanges. Then, we construct a class-balanced curriculum learning strategy to\nbalance the different environments to remove the predicate imbalance.\nComprehensive experiments conducted on VG and GQA datasets demonstrate that our\nEICR framework can be taken as a general strategy for various SGG models, and\nachieve significant improvements.\n","authors":["Yukuan Min","Aming Wu","Cheng Deng"],"pdf_url":"https://arxiv.org/pdf/2308.03282v1.pdf","comment":"ICCV2023. arXiv admin note: text overlap with arXiv:2203.11654 by\n other authors"},{"id":"http://arxiv.org/abs/2308.03280v1","updated":"2023-08-07T03:48:07Z","published":"2023-08-07T03:48:07Z","title":"Mirror-NeRF: Learning Neural Radiance Fields for Mirrors with\n Whitted-Style Ray Tracing","summary":" Recently, Neural Radiance Fields (NeRF) has exhibited significant success in\nnovel view synthesis, surface reconstruction, etc. However, since no physical\nreflection is considered in its rendering pipeline, NeRF mistakes the\nreflection in the mirror as a separate virtual scene, leading to the inaccurate\nreconstruction of the mirror and multi-view inconsistent reflections in the\nmirror. In this paper, we present a novel neural rendering framework, named\nMirror-NeRF, which is able to learn accurate geometry and reflection of the\nmirror and support various scene manipulation applications with mirrors, such\nas adding new objects or mirrors into the scene and synthesizing the\nreflections of these new objects in mirrors, controlling mirror roughness, etc.\nTo achieve this goal, we propose a unified radiance field by introducing the\nreflection probability and tracing rays following the light transport model of\nWhitted Ray Tracing, and also develop several techniques to facilitate the\nlearning process. Experiments and comparisons on both synthetic and real\ndatasets demonstrate the superiority of our method. 
The code and supplementary\nmaterial are available on the project webpage:\nhttps://zju3dv.github.io/Mirror-NeRF/.\n","authors":["Junyi Zeng","Chong Bao","Rui Chen","Zilong Dong","Guofeng Zhang","Hujun Bao","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2308.03280v1.pdf","comment":"Accepted to ACM Multimedia 2023. Project Page:\n https://zju3dv.github.io/Mirror-NeRF/"},{"id":"http://arxiv.org/abs/2308.03276v1","updated":"2023-08-07T03:35:47Z","published":"2023-08-07T03:35:47Z","title":"Spatialyze: A Geospatial Video Analytics System with Spatial-Aware\n Optimizations","summary":" Videos that are shot using commodity hardware such as phones and surveillance\ncameras record various metadata such as time and location. We encounter such\ngeospatial videos on a daily basis and such videos have been growing in volume\nsignificantly. Yet, we do not have data management systems that allow users to\ninteract with such data effectively.\n In this paper, we describe Spatialyze, a new framework for end-to-end\nquerying of geospatial videos. Spatialyze comes with a domain-specific language\nwhere users can construct geospatial video analytic workflows using a 3-step,\ndeclarative, build-filter-observe paradigm. Internally, Spatialyze leverages\nthe declarative nature of such workflows, the temporal-spatial metadata stored\nwith videos, and physical behavior of real-world objects to optimize the\nexecution of workflows. Our results using real-world videos and workflows show\nthat Spatialyze can reduce execution time by up to 5.3x, while maintaining up\nto 97.1% accuracy compared to unoptimized execution.\n","authors":["Chanwut Kittivorawong","Yongming Ge","Yousef Helal","Alvin Cheung"],"pdf_url":"https://arxiv.org/pdf/2308.03276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03272v1","updated":"2023-08-07T03:27:04Z","published":"2023-08-07T03:27:04Z","title":"Feature-Suppressed Contrast for Self-Supervised Food Pre-training","summary":" Most previous approaches for analyzing food images have relied on extensively\nannotated datasets, resulting in significant human labeling expenses due to the\nvaried and intricate nature of such images. Inspired by the effectiveness of\ncontrastive self-supervised methods in utilizing unlabelled data, we\nexplore leveraging these techniques on unlabelled food images. In contrastive\nself-supervised methods, two views are randomly generated from an image by data\naugmentations. However, regarding food images, the two views tend to contain\nsimilar informative contents, causing large mutual information, which impedes\nthe efficacy of contrastive self-supervised learning. To address this problem,\nwe propose Feature Suppressed Contrast (FeaSC) to reduce mutual information\nbetween views. As the similar contents of the two views are salient or highly\nresponsive in the feature map, the proposed FeaSC uses a response-aware scheme\nto localize salient features in an unsupervised manner. By suppressing some\nsalient features in one view while leaving another contrast view unchanged, the\nmutual information between the two views is reduced, thereby enhancing the\neffectiveness of contrast learning for self-supervised food pre-training. As a\nplug-and-play module, the proposed method consistently improves BYOL and\nSimSiam by 1.70\% $\sim$ 6.69\% classification accuracy on four publicly\navailable food recognition datasets. 
Superior results have also been achieved\non downstream segmentation tasks, demonstrating the effectiveness of the\nproposed method.\n","authors":["Xinda Liu","Yaohui Zhu","Linhu Liu","Jiang Tian","Lili Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11418v2","updated":"2023-08-07T03:18:31Z","published":"2023-07-21T08:22:14Z","title":"FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural\n Radiance Fields","summary":" As recent advances in Neural Radiance Fields (NeRF) have enabled\nhigh-fidelity 3D face reconstruction and novel view synthesis, its manipulation\nalso became an essential task in 3D vision. However, existing manipulation\nmethods require extensive human labor, such as a user-provided semantic mask\nand manual attribute search unsuitable for non-expert users. Instead, our\napproach is designed to require a single text to manipulate a face\nreconstructed with NeRF. To do so, we first train a scene manipulator, a latent\ncode-conditional deformable NeRF, over a dynamic scene to control a face\ndeformation using the latent code. However, representing a scene deformation\nwith a single latent code is unfavorable for compositing local deformations\nobserved in different instances. As so, our proposed Position-conditional\nAnchor Compositor (PAC) learns to represent a manipulated scene with spatially\nvarying latent codes. Their renderings with the scene manipulator are then\noptimized to yield high cosine similarity to a target text in CLIP embedding\nspace for text-driven manipulation. To the best of our knowledge, our approach\nis the first to address the text-driven manipulation of a face reconstructed\nwith NeRF. Extensive results, comparisons, and ablation studies demonstrate the\neffectiveness of our approach.\n","authors":["Sungwon Hwang","Junha Hyung","Daejin Kim","Min-Jung Kim","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2307.11418v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03267v1","updated":"2023-08-07T03:16:24Z","published":"2023-08-07T03:16:24Z","title":"Redundancy-aware Transformer for Video Question Answering","summary":" This paper identifies two kinds of redundancy in the current VideoQA\nparadigm. Specifically, the current video encoders tend to holistically embed\nall video clues at different granularities in a hierarchical manner, which\ninevitably introduces \\textit{neighboring-frame redundancy} that can overwhelm\ndetailed visual clues at the object level. Subsequently, prevailing\nvision-language fusion designs introduce the \\textit{cross-modal redundancy} by\nexhaustively fusing all visual elements with question tokens without explicitly\ndifferentiating their pairwise vision-language interactions, thus making a\npernicious impact on the answering.\n To this end, we propose a novel transformer-based architecture, that aims to\nmodel VideoQA in a redundancy-aware manner. To address the neighboring-frame\nredundancy, we introduce a video encoder structure that emphasizes the\nobject-level change in neighboring frames, while adopting an out-of-neighboring\nmessage-passing scheme that imposes attention only on distant frames. As for\nthe cross-modal redundancy, we equip our fusion module with a novel adaptive\nsampling, which explicitly differentiates the vision-language interactions by\nidentifying a small subset of visual elements that exclusively support the\nanswer. 
Upon these advancements, we find this\n\\underline{R}edundancy-\\underline{a}ware trans\\underline{former} (RaFormer) can\nachieve state-of-the-art results on multiple VideoQA benchmarks.\n","authors":["Yicong Li","Xun Yang","An Zhang","Chun Feng","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2308.03267v1.pdf","comment":"Accepted to ACM MM23"},{"id":"http://arxiv.org/abs/2207.01405v4","updated":"2023-08-07T03:11:49Z","published":"2022-07-04T13:37:38Z","title":"I-ViT: Integer-only Quantization for Efficient Vision Transformer\n Inference","summary":" Vision Transformers (ViTs) have achieved state-of-the-art performance on\nvarious computer vision applications. However, these models have considerable\nstorage and computational overheads, making their deployment and efficient\ninference on edge devices challenging. Quantization is a promising approach to\nreducing model complexity, and the dyadic arithmetic pipeline can allow the\nquantized models to perform efficient integer-only inference. Unfortunately,\ndyadic arithmetic is based on the homogeneity condition in convolutional neural\nnetworks, which is not applicable to the non-linear components in ViTs, making\ninteger-only inference of ViTs an open issue. In this paper, we propose I-ViT,\nan integer-only quantization scheme for ViTs, to enable ViTs to perform the\nentire computational graph of inference with integer arithmetic and\nbit-shifting, and without any floating-point arithmetic. In I-ViT, linear\noperations (e.g., MatMul and Dense) follow the integer-only pipeline with\ndyadic arithmetic, and non-linear operations (e.g., Softmax, GELU, and\nLayerNorm) are approximated by the proposed light-weight integer-only\narithmetic methods. More specifically, I-ViT applies the proposed Shiftmax and\nShiftGELU, which are designed to use integer bit-shifting to approximate the\ncorresponding floating-point operations. We evaluate I-ViT on various benchmark\nmodels and the results show that integer-only INT8 quantization achieves\ncomparable (or even slightly higher) accuracy to the full-precision (FP)\nbaseline. Furthermore, we utilize TVM for practical hardware deployment on the\nGPU's integer arithmetic units, achieving 3.72$\\sim$4.11$\\times$ inference\nspeedup compared to the FP model. Code of both Pytorch and TVM is released at\nhttps://github.com/zkkli/I-ViT.\n","authors":["Zhikai Li","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2207.01405v4.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2212.08632v2","updated":"2023-08-07T03:02:06Z","published":"2022-12-16T18:12:04Z","title":"Enhancing Multi-modal and Multi-hop Question Answering via Structured\n Knowledge and Unified Retrieval-Generation","summary":" Multi-modal multi-hop question answering involves answering a question by\nreasoning over multiple input sources from different modalities. Existing\nmethods often retrieve evidences separately and then use a language model to\ngenerate an answer based on the retrieved evidences, and thus do not adequately\nconnect candidates and are unable to model the interdependent relations during\nretrieval. Moreover, the pipelined approaches of retrieval and generation might\nresult in poor generation performance when retrieval performance is low. To\naddress these issues, we propose a Structured Knowledge and Unified\nRetrieval-Generation (SKURG) approach. SKURG employs an Entity-centered Fusion\nEncoder to align sources from different modalities using shared entities. 
It\nthen uses a unified Retrieval-Generation Decoder to integrate intermediate\nretrieval results for answer generation and also adaptively determine the\nnumber of retrieval steps. Extensive experiments on two representative\nmulti-modal multi-hop QA datasets MultimodalQA and WebQA demonstrate that SKURG\noutperforms the state-of-the-art models in both source retrieval and answer\ngeneration performance with fewer parameters. Our code is available at\nhttps://github.com/HITsz-TMG/SKURG.\n","authors":["Qian Yang","Qian Chen","Wen Wang","Baotian Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.08632v2.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2212.08254v2","updated":"2023-08-07T03:00:41Z","published":"2022-12-16T02:52:37Z","title":"RepQ-ViT: Scale Reparameterization for Post-Training Quantization of\n Vision Transformers","summary":" Post-training quantization (PTQ), which only requires a tiny dataset for\ncalibration without end-to-end retraining, is a light and practical model\ncompression technique. Recently, several PTQ schemes for vision transformers\n(ViTs) have been presented; unfortunately, they typically suffer from\nnon-trivial accuracy degradation, especially in low-bit cases. In this paper,\nwe propose RepQ-ViT, a novel PTQ framework for ViTs based on quantization scale\nreparameterization, to address the above issues. RepQ-ViT decouples the\nquantization and inference processes, where the former employs complex\nquantizers and the latter employs scale-reparameterized simplified quantizers.\nThis ensures both accurate quantization and efficient inference, which\ndistinguishes it from existing approaches that sacrifice quantization\nperformance to meet the target hardware. More specifically, we focus on two\ncomponents with extreme distributions: post-LayerNorm activations with severe\ninter-channel variation and post-Softmax activations with power-law features,\nand initially apply channel-wise quantization and log$\\sqrt{2}$ quantization,\nrespectively. Then, we reparameterize the scales to hardware-friendly\nlayer-wise quantization and log2 quantization for inference, with only slight\naccuracy or computational costs. Extensive experiments are conducted on\nmultiple vision tasks with different model variants, proving that RepQ-ViT,\nwithout hyperparameters and expensive reconstruction procedures, can outperform\nexisting strong baselines and encouragingly improve the accuracy of 4-bit PTQ\nof ViTs to a usable level. Code is available at\nhttps://github.com/zkkli/RepQ-ViT.\n","authors":["Zhikai Li","Junrui Xiao","Lianwei Yang","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2212.08254v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03262v1","updated":"2023-08-07T02:57:48Z","published":"2023-08-07T02:57:48Z","title":"A Benchmark for Chinese-English Scene Text Image Super-resolution","summary":" Scene Text Image Super-resolution (STISR) aims to recover high-resolution\n(HR) scene text images with visually pleasant and readable text content from\nthe given low-resolution (LR) input. Most existing works focus on recovering\nEnglish texts, which have relatively simple character structures, while little\nwork has been done on the more challenging Chinese texts with diverse and\ncomplex character structures. In this paper, we propose a real-world\nChinese-English benchmark dataset, namely Real-CE, for the task of STISR with\nthe emphasis on restoring structurally complex Chinese characters. 
The\nbenchmark provides 1,935/783 real-world LR-HR text image pairs~(contains 33,789\ntext lines in total) for training/testing in 2$\\times$ and 4$\\times$ zooming\nmodes, complemented by detailed annotations, including detection boxes and text\ntranscripts. Moreover, we design an edge-aware learning method, which provides\nstructural supervision in image and feature domains, to effectively reconstruct\nthe dense structures of Chinese characters. We conduct experiments on the\nproposed Real-CE benchmark and evaluate the existing STISR models with and\nwithout our edge-aware loss. The benchmark, including data and source code, is\navailable at https://github.com/mjq11302010044/Real-CE.\n","authors":["Jianqi Ma","Zhetong Liang","Wangmeng Xiang","Xi Yang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03262v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.02153v2","updated":"2023-08-07T02:33:21Z","published":"2023-08-04T06:20:20Z","title":"Robust Self-Supervised Extrinsic Self-Calibration","summary":" Autonomous vehicles and robots need to operate over a wide variety of\nscenarios in order to complete tasks efficiently and safely. Multi-camera\nself-supervised monocular depth estimation from videos is a promising way to\nreason about the environment, as it generates metrically scaled geometric\npredictions from visual data without requiring additional sensors. However,\nmost works assume well-calibrated extrinsics to fully leverage this\nmulti-camera setup, even though accurate and efficient calibration is still a\nchallenging problem. In this work, we introduce a novel method for extrinsic\ncalibration that builds upon the principles of self-supervised monocular depth\nand ego-motion learning. Our proposed curriculum learning strategy uses\nmonocular depth and pose estimators with velocity supervision to estimate\nextrinsics, and then jointly learns extrinsic calibration along with depth and\npose for a set of overlapping cameras rigidly attached to a moving vehicle.\nExperiments on a benchmark multi-camera dataset (DDAD) demonstrate that our\nmethod enables self-calibration in various scenes robustly and efficiently\ncompared to a traditional vision-based pose estimation pipeline. Furthermore,\nwe demonstrate the benefits of extrinsics self-calibration as a way to improve\ndepth prediction via joint optimization.\n","authors":["Takayuki Kanai","Igor Vasiljevic","Vitor Guizilini","Adrien Gaidon","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2308.02153v2.pdf","comment":"Project page: https://sites.google.com/view/tri-sesc"},{"id":"http://arxiv.org/abs/2308.03258v1","updated":"2023-08-07T02:30:47Z","published":"2023-08-07T02:30:47Z","title":"APBench: A Unified Benchmark for Availability Poisoning Attacks and\n Defenses","summary":" The efficacy of availability poisoning, a method of poisoning data by\ninjecting imperceptible perturbations to prevent its use in model training, has\nbeen a hot subject of investigation. Previous research suggested that it was\ndifficult to effectively counteract such poisoning attacks. However, the\nintroduction of various defense methods has challenged this notion. Due to the\nrapid progress in this field, the performance of different novel methods cannot\nbe accurately validated due to variations in experimental setups. To further\nevaluate the attack and defense capabilities of these poisoning methods, we\nhave developed a benchmark -- APBench for assessing the efficacy of adversarial\npoisoning. 
APBench consists of 9 state-of-the-art availability poisoning\nattacks, 8 defense algorithms, and 4 conventional data augmentation techniques.\nWe also have set up experiments with varying different poisoning ratios, and\nevaluated the attacks on multiple datasets and their transferability across\nmodel architectures. We further conducted a comprehensive evaluation of 2\nadditional attacks specifically targeting unsupervised models. Our results\nreveal the glaring inadequacy of existing attacks in safeguarding individual\nprivacy. APBench is open source and available to the deep learning community:\nhttps://github.com/lafeat/apbench.\n","authors":["Tianrui Qin","Xitong Gao","Juanjuan Zhao","Kejiang Ye","Cheng-Zhong Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03256v1","updated":"2023-08-07T02:25:06Z","published":"2023-08-07T02:25:06Z","title":"Learning a Graph Neural Network with Cross Modality Interaction for\n Image Fusion","summary":" Infrared and visible image fusion has gradually proved to be a vital fork in\nthe field of multi-modality imaging technologies. In recent developments,\nresearchers not only focus on the quality of fused images but also evaluate\ntheir performance in downstream tasks. Nevertheless, the majority of methods\nseldom put their eyes on the mutual learning from different modalities,\nresulting in fused images lacking significant details and textures. To overcome\nthis issue, we propose an interactive graph neural network (GNN)-based\narchitecture between cross modality for fusion, called IGNet. Specifically, we\nfirst apply a multi-scale extractor to achieve shallow features, which are\nemployed as the necessary input to build graph structures. Then, the graph\ninteraction module can construct the extracted intermediate features of the\ninfrared/visible branch into graph structures. Meanwhile, the graph structures\nof two branches interact for cross-modality and semantic learning, so that\nfused images can maintain the important feature expressions and enhance the\nperformance of downstream tasks. Besides, the proposed leader nodes can improve\ninformation propagation in the same modality. Finally, we merge all graph\nfeatures to get the fusion result. Extensive experiments on different datasets\n(TNO, MFNet and M3FD) demonstrate that our IGNet can generate visually\nappealing fused images while scoring averagely 2.59% mAP@.5 and 7.77% mIoU\nhigher in detection and segmentation than the compared state-of-the-art\nmethods. The source code of the proposed IGNet can be available at\nhttps://github.com/lok-18/IGNet.\n","authors":["Jiawei Li","Jiansheng Chen","Jinyuan Liu","Huimin Ma"],"pdf_url":"https://arxiv.org/pdf/2308.03256v1.pdf","comment":"9 pages, 10 figures, ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03244v1","updated":"2023-08-07T01:43:25Z","published":"2023-08-07T01:43:25Z","title":"Mind the Gap: Improving Success Rate of Vision-and-Language Navigation\n by Revisiting Oracle Success Routes","summary":" Vision-and-Language Navigation (VLN) aims to navigate to the target location\nby following a given instruction. Unlike existing methods focused on predicting\na more accurate action at each step in navigation, in this paper, we make the\nfirst attempt to tackle a long-ignored problem in VLN: narrowing the gap\nbetween Success Rate (SR) and Oracle Success Rate (OSR). 
We observe a\nconsistently large gap (up to 9%) on four state-of-the-art VLN methods across\ntwo benchmark datasets: R2R and REVERIE. The high OSR indicates the robot agent\npasses the target location, while the low SR suggests the agent actually fails\nto stop at the target location at last. Instead of predicting actions directly,\nwe propose to mine the target location from a trajectory given by off-the-shelf\nVLN models. Specially, we design a multi-module transformer-based model for\nlearning compact discriminative trajectory viewpoint representation, which is\nused to predict the confidence of being a target location as described in the\ninstruction. The proposed method is evaluated on three widely-adopted datasets:\nR2R, REVERIE and NDH, and shows promising results, demonstrating the potential\nfor more future research.\n","authors":["Chongyang Zhao","Yuankai Qi","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12294v2","updated":"2023-08-07T01:21:19Z","published":"2022-12-23T12:51:42Z","title":"FFNeRV: Flow-Guided Frame-Wise Neural Representations for Videos","summary":" Neural fields, also known as coordinate-based or implicit neural\nrepresentations, have shown a remarkable capability of representing,\ngenerating, and manipulating various forms of signals. For video\nrepresentations, however, mapping pixel-wise coordinates to RGB colors has\nshown relatively low compression performance and slow convergence and inference\nspeed. Frame-wise video representation, which maps a temporal coordinate to its\nentire frame, has recently emerged as an alternative method to represent\nvideos, improving compression rates and encoding speed. While promising, it has\nstill failed to reach the performance of state-of-the-art video compression\nalgorithms. In this work, we propose FFNeRV, a novel method for incorporating\nflow information into frame-wise representations to exploit the temporal\nredundancy across the frames in videos inspired by the standard video codecs.\nFurthermore, we introduce a fully convolutional architecture, enabled by\none-dimensional temporal grids, improving the continuity of spatial features.\nExperimental results show that FFNeRV yields the best performance for video\ncompression and frame interpolation among the methods using frame-wise\nrepresentations or neural fields. To reduce the model size even further, we\ndevise a more compact convolutional architecture using the group and pointwise\nconvolutions. With model compression techniques, including quantization-aware\ntraining and entropy coding, FFNeRV outperforms widely-used standard video\ncodecs (H.264 and HEVC) and performs on par with state-of-the-art video\ncompression algorithms.\n","authors":["Joo Chan Lee","Daniel Rho","Jong Hwan Ko","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2212.12294v2.pdf","comment":"Our project page including code is available at\n https://maincold2.github.io/ffnerv/"},{"id":"http://arxiv.org/abs/2206.02659v5","updated":"2023-08-07T01:20:01Z","published":"2022-06-06T14:52:46Z","title":"Robust Fine-Tuning of Deep Neural Networks with Hessian-based\n Generalization Guarantees","summary":" We consider fine-tuning a pretrained deep neural network on a target task. We\nstudy the generalization properties of fine-tuning to understand the problem of\noverfitting, which has often been observed (e.g., when the target dataset is\nsmall or when the training labels are noisy). 
Existing generalization measures\nfor deep networks depend on notions such as distance from the initialization\n(i.e., the pretrained network) of the fine-tuned model and noise stability\nproperties of deep networks. This paper identifies a Hessian-based distance\nmeasure through PAC-Bayesian analysis, which is shown to correlate well with\nobserved generalization gaps of fine-tuned models. Theoretically, we prove\nHessian distance-based generalization bounds for fine-tuned models. We also\ndescribe an extended study of fine-tuning against label noise, where\noverfitting is against a critical problem; We present an algorithm and a\ngeneralization error guarantee for this algorithm under a class conditional\nindependent noise model. Empirically, we observe that the Hessian-based\ndistance measure can match the scale of the observed generalization gap of\nfine-tuned models in practice. We also test our algorithm on several image\nclassification tasks with noisy training labels, showing notable gains over\nprior methods, and the Hessian distance measure of the fine-tuned model\ndecreases substantially.\n","authors":["Haotian Ju","Dongyue Li","Hongyang R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2206.02659v5.pdf","comment":"37 pages. Appeared in ICML 2022"},{"id":"http://arxiv.org/abs/2308.03950v1","updated":"2023-08-07T23:41:55Z","published":"2023-08-07T23:41:55Z","title":"Zero-shot Skeleton-based Action Recognition via Mutual Information\n Estimation and Maximization","summary":" Zero-shot skeleton-based action recognition aims to recognize actions of\nunseen categories after training on data of seen categories. The key is to\nbuild the connection between visual and semantic space from seen to unseen\nclasses. Previous studies have primarily focused on encoding sequences into a\nsingular feature vector, with subsequent mapping the features to an identical\nanchor point within the embedded space. Their performance is hindered by 1) the\nignorance of the global visual/semantic distribution alignment, which results\nin a limitation to capture the true interdependence between the two spaces. 2)\nthe negligence of temporal information since the frame-wise features with rich\naction clues are directly pooled into a single feature vector. We propose a new\nzero-shot skeleton-based action recognition method via mutual information (MI)\nestimation and maximization. Specifically, 1) we maximize the MI between visual\nand semantic space for distribution alignment; 2) we leverage the temporal\ninformation for estimating the MI by encouraging MI to increase as more frames\nare observed. Extensive experiments on three large-scale skeleton action\ndatasets confirm the effectiveness of our method. Code:\nhttps://github.com/YujieOuO/SMIE.\n","authors":["Yujie Zhou","Wenwen Qiang","Anyi Rao","Ning Lin","Bing Su","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03950v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2204.11041v2","updated":"2023-08-07T22:47:07Z","published":"2022-04-23T10:19:58Z","title":"Learning by Erasing: Conditional Entropy based Transferable\n Out-Of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is essential to handle the distribution\nshifts between training and test scenarios. For a new in-distribution (ID)\ndataset, existing methods require retraining to capture the dataset-specific\nfeature representation or data distribution. 
In this paper, we propose a deep\ngenerative models (DGM) based transferable OOD detection method, which is\nunnecessary to retrain on a new ID dataset. We design an image erasing strategy\nto equip exclusive conditional entropy distribution for each ID dataset, which\ndetermines the discrepancy of DGM's posteriori ucertainty distribution on\ndifferent ID datasets. Owing to the powerful representation capacity of\nconvolutional neural networks, the proposed model trained on complex dataset\ncan capture the above discrepancy between ID datasets without retraining and\nthus achieve transferable OOD detection. We validate the proposed method on\nfive datasets and verity that ours achieves comparable performance to the\nstate-of-the-art group based OOD detection methods that need to be retrained to\ndeploy on new ID datasets. Our code is available at\nhttps://github.com/oOHCIOo/CETOOD.\n","authors":["Meng Xing","Zhiyong Feng","Yong Su","Changjae Oh"],"pdf_url":"https://arxiv.org/pdf/2204.11041v2.pdf","comment":"update new experimental results"},{"id":"http://arxiv.org/abs/2308.03939v1","updated":"2023-08-07T22:44:26Z","published":"2023-08-07T22:44:26Z","title":"Deterministic Neural Illumination Mapping for Efficient Auto-White\n Balance Correction","summary":" Auto-white balance (AWB) correction is a critical operation in image signal\nprocessors for accurate and consistent color correction across various\nillumination scenarios. This paper presents a novel and efficient AWB\ncorrection method that achieves at least 35 times faster processing with\nequivalent or superior performance on high-resolution images for the current\nstate-of-the-art methods. Inspired by deterministic color style transfer, our\napproach introduces deterministic illumination color mapping, leveraging\nlearnable projection matrices for both canonical illumination form and\nAWB-corrected output. It involves feeding high-resolution images and\ncorresponding latent representations into a mapping module to derive a\ncanonical form, followed by another mapping module that maps the pixel values\nto those for the corrected version. This strategy is designed as\nresolution-agnostic and also enables seamless integration of any pre-trained\nAWB network as the backbone. Experimental results confirm the effectiveness of\nour approach, revealing significant performance improvements and reduced time\ncomplexity compared to state-of-the-art methods. Our method provides an\nefficient deep learning-based AWB correction solution, promising real-time,\nhigh-quality color correction for digital imaging applications. Source code is\navailable at https://github.com/birdortyedi/DeNIM/\n","authors":["Furkan Kınlı","Doğa Yılmaz","Barış Özcan","Furkan Kıraç"],"pdf_url":"https://arxiv.org/pdf/2308.03939v1.pdf","comment":"9 pages, 5 figures, ICCV 2023 Workshops (RCV 2023)"},{"id":"http://arxiv.org/abs/2308.03936v1","updated":"2023-08-07T22:39:44Z","published":"2023-08-07T22:39:44Z","title":"ALFA -- Leveraging All Levels of Feature Abstraction for Enhancing the\n Generalization of Histopathology Image Classification Across Unseen Hospitals","summary":" We propose an exhaustive methodology that leverages all levels of feature\nabstraction, targeting an enhancement in the generalizability of image\nclassification to unobserved hospitals. Our approach incorporates\naugmentation-based self-supervision with common distribution shifts in\nhistopathology scenarios serving as the pretext task. 
This enables us to derive\ninvariant features from training images without relying on training labels,\nthereby covering different abstraction levels. Moving onto the subsequent\nabstraction level, we employ a domain alignment module to facilitate further\nextraction of invariant features across varying training hospitals. To\nrepresent the highly specific features of participating hospitals, an encoder\nis trained to classify hospital labels, independent of their diagnostic labels.\nThe features from each of these encoders are subsequently disentangled to\nminimize redundancy and segregate the features. This representation, which\nspans a broad spectrum of semantic information, enables the development of a\nmodel demonstrating increased robustness to unseen images from disparate\ndistributions. Experimental results from the PACS dataset (a domain\ngeneralization benchmark), a synthetic dataset created by applying\nhistopathology-specific jitters to the MHIST dataset (defining different\ndomains with varied distribution shifts), and a Renal Cell Carcinoma dataset\nderived from four image repositories from TCGA, collectively indicate that our\nproposed model is adept at managing varying levels of image granularity. Thus,\nit shows improved generalizability when faced with new, out-of-distribution\nhospital images.\n","authors":["Milad Sikaroudi","Shahryar Rahnamayan","H. R. Tizhoosh"],"pdf_url":"https://arxiv.org/pdf/2308.03936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.01248v2","updated":"2023-08-07T22:21:24Z","published":"2022-04-04T05:27:40Z","title":"Differentiable Rendering for Synthetic Aperture Radar Imagery","summary":" There is rising interest in differentiable rendering, which allows explicitly\nmodeling geometric priors and constraints in optimization pipelines using\nfirst-order methods such as backpropagation. Incorporating such domain\nknowledge can lead to deep neural networks that are trained more robustly and\nwith limited data, as well as the capability to solve ill-posed inverse\nproblems. Existing efforts in differentiable rendering have focused on imagery\nfrom electro-optical sensors, particularly conventional RGB-imagery. In this\nwork, we propose an approach for differentiable rendering of Synthetic Aperture\nRadar (SAR) imagery, which combines methods from 3D computer graphics with\nneural rendering. We demonstrate the approach on the inverse graphics problem\nof 3D Object Reconstruction from limited SAR imagery using high-fidelity\nsimulated SAR data.\n","authors":["Michael Wilmanski","Jonathan Tamir"],"pdf_url":"https://arxiv.org/pdf/2204.01248v2.pdf","comment":"This version of the manuscript is an updated preprint which has been\n recently accepted by IEEE Transactions on Aerospace Electronic Systems, but\n has not yet been published or processed by IEEE"},{"id":"http://arxiv.org/abs/2307.16074v2","updated":"2023-08-07T22:11:33Z","published":"2023-07-29T20:46:44Z","title":"Iterative Graph Filtering Network for 3D Human Pose Estimation","summary":" Graph convolutional networks (GCNs) have proven to be an effective approach\nfor 3D human pose estimation. By naturally modeling the skeleton structure of\nthe human body as a graph, GCNs are able to capture the spatial relationships\nbetween joints and learn an efficient representation of the underlying pose.\nHowever, most GCN-based methods use a shared weight matrix, making it\nchallenging to accurately capture the different and complex relationships\nbetween joints. 
In this paper, we introduce an iterative graph filtering\nframework for 3D human pose estimation, which aims to predict the 3D joint\npositions given a set of 2D joint locations in images. Our approach builds upon\nthe idea of iteratively solving graph filtering with Laplacian regularization\nvia the Gauss-Seidel iterative method. Motivated by this iterative solution, we\ndesign a Gauss-Seidel network (GS-Net) architecture, which makes use of weight\nand adjacency modulation, skip connection, and a pure convolutional block with\nlayer normalization. Adjacency modulation facilitates the learning of edges\nthat go beyond the inherent connections of body joints, resulting in an\nadjusted graph structure that reflects the human skeleton, while skip\nconnections help maintain crucial information from the input layer's initial\nfeatures as the network depth increases. We evaluate our proposed model on two\nstandard benchmark datasets, and compare it with a comprehensive set of strong\nbaseline methods for 3D human pose estimation. Our experimental results\ndemonstrate that our approach outperforms the baseline methods on both\ndatasets, achieving state-of-the-art performance. Furthermore, we conduct\nablation studies to analyze the contributions of different components of our\nmodel architecture and show that the skip connection and adjacency modulation\nhelp improve the model performance.\n","authors":["Zaedul Islam","A. Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2307.16074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05116v2","updated":"2023-08-07T21:50:04Z","published":"2022-12-09T20:45:09Z","title":"Leveraging Contextual Data Augmentation for Generalizable Melanoma\n Detection","summary":" While skin cancer detection has been a valuable deep learning application for\nyears, its evaluation has often neglected the context in which testing images\nare assessed. Traditional melanoma classifiers assume that their testing\nenvironments are comparable to the structured images they are trained on. This\npaper challenges this notion and argues that mole size, a critical attribute in\nprofessional dermatology, can be misleading in automated melanoma detection.\nWhile malignant melanomas tend to be larger than benign melanomas, relying\nsolely on size can be unreliable and even harmful when contextual scaling of\nimages is not possible. To address this issue, this implementation proposes a\ncustom model that performs various data augmentation procedures to prevent\noverfitting to incorrect parameters and simulate real-world usage of melanoma\ndetection applications. Multiple custom models employing different forms of\ndata augmentation are implemented to highlight the most significant features of\nmole classifiers. These implementations emphasize the importance of considering\nuser unpredictability when deploying such applications. The caution required\nwhen manually modifying data is acknowledged, as it can result in data loss and\nbiased conclusions. 
Additionally, the significance of data augmentation in both\nthe dermatology and deep learning communities is considered.\n","authors":["Nick DiSanto","Gavin Harding","Ethan Martinez","Benjamin Sanders"],"pdf_url":"https://arxiv.org/pdf/2212.05116v2.pdf","comment":"6 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.03908v1","updated":"2023-08-07T20:50:54Z","published":"2023-08-07T20:50:54Z","title":"ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings\n for Video Action Recognition","summary":" Video Action Recognition (VAR) is a challenging task due to its inherent\ncomplexities. Though different approaches have been explored in the literature,\ndesigning a unified framework to recognize a large number of human actions is\nstill a challenging problem. Recently, Multi-Modal Learning (MML) has\ndemonstrated promising results in this domain. In literature, 2D skeleton or\npose modality has often been used for this task, either independently or in\nconjunction with the visual information (RGB modality) present in videos.\nHowever, the combination of pose, visual information, and text attributes has\nnot been explored yet, though text and pose attributes independently have been\nproven to be effective in numerous computer vision tasks. In this paper, we\npresent the first pose augmented Vision-language model (VLM) for VAR. Notably,\nour scheme achieves an accuracy of 92.81% and 73.02% on two popular human video\naction recognition benchmark datasets, UCF-101 and HMDB-51, respectively, even\nwithout any video data pre-training, and an accuracy of 96.11% and 75.75% after\nkinetics pre-training.\n","authors":["Soumyabrata Chaudhuri","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.03908v1.pdf","comment":"7 pages, 3 figures, 2 Tables"},{"id":"http://arxiv.org/abs/2308.03906v1","updated":"2023-08-07T20:48:07Z","published":"2023-08-07T20:48:07Z","title":"TIJO: Trigger Inversion with Joint Optimization for Defending Multimodal\n Backdoored Models","summary":" We present a Multimodal Backdoor Defense technique TIJO (Trigger Inversion\nusing Joint Optimization). Recent work arXiv:2112.07668 has demonstrated\nsuccessful backdoor attacks on multimodal models for the Visual Question\nAnswering task. Their dual-key backdoor trigger is split across two modalities\n(image and text), such that the backdoor is activated if and only if the\ntrigger is present in both modalities. We propose TIJO that defends against\ndual-key attacks through a joint optimization that reverse-engineers the\ntrigger in both the image and text modalities. This joint optimization is\nchallenging in multimodal models due to the disconnected nature of the visual\npipeline which consists of an offline feature extractor, whose output is then\nfused with the text using a fusion module. The key insight enabling the joint\noptimization in TIJO is that the trigger inversion needs to be carried out in\nthe object detection box feature space as opposed to the pixel space. We\ndemonstrate the effectiveness of our method on the TrojVQA benchmark, where\nTIJO improves upon the state-of-the-art unimodal methods from an AUC of 0.6 to\n0.92 on multimodal dual-key backdoors. Furthermore, our method also improves\nupon the unimodal baselines on unimodal backdoors. We present ablation studies\nand qualitative results to provide insights into our algorithm such as the\ncritical importance of overlaying the inverted feature triggers on all visual\nfeatures during trigger inversion. 
The prototype implementation of TIJO is\navailable at https://github.com/SRI-CSL/TIJO.\n","authors":["Indranil Sur","Karan Sikka","Matthew Walmer","Kaushik Koneripalli","Anirban Roy","Xiao Lin","Ajay Divakaran","Susmit Jha"],"pdf_url":"https://arxiv.org/pdf/2308.03906v1.pdf","comment":"Published as conference paper at ICCV 2023. 13 pages, 6 figures, 7\n tables"},{"id":"http://arxiv.org/abs/2308.03900v1","updated":"2023-08-07T20:23:39Z","published":"2023-08-07T20:23:39Z","title":"Developability Approximation for Neural Implicits through Rank\n Minimization","summary":" Developability refers to the process of creating a surface without any\ntearing or shearing from a two-dimensional plane. It finds practical\napplications in the fabrication industry. An essential characteristic of a\ndevelopable 3D surface is its zero Gaussian curvature, which means that either\none or both of the principal curvatures are zero. This paper introduces a\nmethod for reconstructing an approximate developable surface from a neural\nimplicit surface. The central idea of our method involves incorporating a\nregularization term that operates on the second-order derivatives of the neural\nimplicits, effectively promoting zero Gaussian curvature. Implicit surfaces\noffer the advantage of smoother deformation with infinite resolution,\novercoming the high polygonal constraints of state-of-the-art methods using\ndiscrete representations. We draw inspiration from the properties of surface\ncurvature and employ rank minimization techniques derived from compressed\nsensing. Experimental results on both developable and non-developable surfaces,\nincluding those affected by noise, validate the generalizability of our method.\n","authors":["Pratheba Selvaraju"],"pdf_url":"https://arxiv.org/pdf/2308.03900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12622v4","updated":"2023-08-07T20:10:51Z","published":"2023-07-24T08:51:49Z","title":"Phase Matching for Out-of-Distribution Generalization","summary":" The Fourier transform, serving as an explicit decomposition method for visual\nsignals, has been employed to explain the out-of-distribution generalization\nbehaviors of Convolutional Neural Networks (CNNs). Previous studies have\nindicated that the amplitude spectrum is susceptible to the disturbance caused\nby distribution shifts. On the other hand, the phase spectrum preserves\nhighly-structured spatial information, which is crucial for robust visual\nrepresentation learning. However, the spatial relationships of phase spectrum\nremain unexplored in previous researches. In this paper, we aim to clarify the\nrelationships between Domain Generalization (DG) and the frequency components,\nand explore the spatial relationships of the phase spectrum. Specifically, we\nfirst introduce a Fourier-based structural causal model which interprets the\nphase spectrum as semi-causal factors and the amplitude spectrum as non-causal\nfactors. Then, we propose Phase Matching (PhaMa) to address DG problems. Our\nmethod introduces perturbations on the amplitude spectrum and establishes\nspatial relationships to match the phase components. 
Through experiments on\nmultiple benchmarks, we demonstrate that our proposed method achieves\nstate-of-the-art performance in domain generalization and out-of-distribution\nrobustness tasks.\n","authors":["Chengming Hu","Yeqian Du","Rui Wang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12622v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00501v4","updated":"2023-08-07T19:10:18Z","published":"2023-04-02T10:27:34Z","title":"A Comprehensive Review of YOLO: From YOLOv1 and Beyond","summary":" YOLO has become a central real-time object detection system for robotics,\ndriverless cars, and video monitoring applications. We present a comprehensive\nanalysis of YOLO's evolution, examining the innovations and contributions in\neach iteration from the original YOLO up to YOLOv8, YOLO-NAS, and YOLO with\nTransformers. We start by describing the standard metrics and postprocessing;\nthen, we discuss the major changes in network architecture and training tricks\nfor each model. Finally, we summarize the essential lessons from YOLO's\ndevelopment and provide a perspective on its future, highlighting potential\nresearch directions to enhance real-time object detection systems.\n","authors":["Juan Terven","Diana Cordova-Esparza"],"pdf_url":"https://arxiv.org/pdf/2304.00501v4.pdf","comment":"34 pages, 19 figures, 4 tables, submitted to ACM Computing Surveys.\n This version adds information about YOLO with transformers"},{"id":"http://arxiv.org/abs/2308.03867v1","updated":"2023-08-07T18:39:14Z","published":"2023-08-07T18:39:14Z","title":"From Sky to the Ground: A Large-scale Benchmark and Simple Baseline\n Towards Real Rain Removal","summary":" Learning-based image deraining methods have made great progress. However, the\nlack of large-scale high-quality paired training samples is the main bottleneck\nto hamper the real image deraining (RID). To address this dilemma and advance\nRID, we construct a Large-scale High-quality Paired real rain benchmark\n(LHP-Rain), including 3000 video sequences with 1 million high-resolution\n(1920*1080) frame pairs. The advantages of the proposed dataset over the\nexisting ones are three-fold: rain with higher-diversity and larger-scale,\nimage with higher-resolution and higher-quality ground-truth. Specifically, the\nreal rains in LHP-Rain not only contain the classical rain\nstreak/veiling/occlusion in the sky, but also the \\textbf{splashing on the\nground} overlooked by deraining community. Moreover, we propose a novel robust\nlow-rank tensor recovery model to generate the GT with better separating the\nstatic background from the dynamic rain. In addition, we design a simple\ntransformer-based single image deraining baseline, which simultaneously utilize\nthe self-attention and cross-layer attention within the image and rain layer\nwith discriminative feature representation. 
Extensive experiments verify the\nsuperiority of the proposed dataset and deraining method over state-of-the-art.\n","authors":["Yun Guo","Xueyao Xiao","Yi Chang","Shumin Deng","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2308.03867v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03865v1","updated":"2023-08-07T18:27:04Z","published":"2023-08-07T18:27:04Z","title":"DefCor-Net: Physics-Aware Ultrasound Deformation Correction","summary":" The recovery of morphologically accurate anatomical images from deformed ones\nis challenging in ultrasound (US) image acquisition, but crucial to accurate\nand consistent diagnosis, particularly in the emerging field of\ncomputer-assisted diagnosis. This article presents a novel anatomy-aware\ndeformation correction approach based on a coarse-to-fine, multi-scale deep\nneural network (DefCor-Net). To achieve pixel-wise performance, DefCor-Net\nincorporates biomedical knowledge by estimating pixel-wise stiffness online\nusing a U-shaped feature extractor. The deformation field is then computed\nusing polynomial regression by integrating the measured force applied by the US\nprobe. Based on real-time estimation of pixel-by-pixel tissue properties, the\nlearning-based approach enables the potential for anatomy-aware deformation\ncorrection. To demonstrate the effectiveness of the proposed DefCor-Net, images\nrecorded at multiple locations on forearms and upper arms of six volunteers are\nused to train and validate DefCor-Net. The results demonstrate that DefCor-Net\ncan significantly improve the accuracy of deformation correction to recover the\noriginal geometry (Dice Coefficient: from $14.3\\pm20.9$ to $82.6\\pm12.1$ when\nthe force is $6N$).\n","authors":["Zhongliang Jiang","Yue Zhou","Dongliang Cao","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2308.03865v1.pdf","comment":"Accepted by MedIA. code is available"},{"id":"http://arxiv.org/abs/2308.03861v1","updated":"2023-08-07T18:15:03Z","published":"2023-08-07T18:15:03Z","title":"High-Throughput and Accurate 3D Scanning of Cattle Using Time-of-Flight\n Sensors and Deep Learning","summary":" We introduce a high throughput 3D scanning solution specifically designed to\nprecisely measure cattle phenotypes. This scanner leverages an array of depth\nsensors, i.e. time-of-flight (Tof) sensors, each governed by dedicated embedded\ndevices. The system excels at generating high-fidelity 3D point clouds, thus\nfacilitating an accurate mesh that faithfully reconstructs the cattle geometry\non the fly. In order to evaluate the performance of our system, we have\nimplemented a two-fold validation process. Initially, we test the scanner's\ncompetency in determining volume and surface area measurements within a\ncontrolled environment featuring known objects. Secondly, we explore the impact\nand necessity of multi-device synchronization when operating a series of\ntime-of-flight sensors. Based on the experimental results, the proposed system\nis capable of producing high-quality meshes of untamed cattle for livestock\nstudies.\n","authors":["Gbenga Omotara","Seyed Mohamad Ali Tousi","Jared Decker","Derek Brake","Guilherme N. 
DeSouza"],"pdf_url":"https://arxiv.org/pdf/2308.03861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03826v1","updated":"2023-08-07T17:49:04Z","published":"2023-08-07T17:49:04Z","title":"Recurrent Multi-scale Transformer for High-Resolution Salient Object\n Detection","summary":" Salient Object Detection (SOD) aims to identify and segment the most\nconspicuous objects in an image or video. As an important pre-processing step,\nit has many potential applications in multimedia and vision tasks. With the\nadvance of imaging devices, SOD with high-resolution images is of great demand,\nrecently. However, traditional SOD methods are largely limited to\nlow-resolution images, making them difficult to adapt to the development of\nHigh-Resolution SOD (HRSOD). Although some HRSOD methods emerge, there are no\nlarge enough datasets for training and evaluating. Besides, current HRSOD\nmethods generally produce incomplete object regions and irregular object\nboundaries. To address above issues, in this work, we first propose a new\nHRS10K dataset, which contains 10,500 high-quality annotated images at 2K-8K\nresolution. As far as we know, it is the largest dataset for the HRSOD task,\nwhich will significantly help future works in training and evaluating models.\nFurthermore, to improve the HRSOD performance, we propose a novel Recurrent\nMulti-scale Transformer (RMFormer), which recurrently utilizes shared\nTransformers and multi-scale refinement architectures. Thus, high-resolution\nsaliency maps can be generated with the guidance of lower-resolution\npredictions. Extensive experiments on both high-resolution and low-resolution\nbenchmarks show the effectiveness and superiority of the proposed framework.\nThe source code and dataset are released at:\nhttps://github.com/DrowsyMon/RMFormer.\n","authors":["Xinhao Deng","Pingping Zhang","Wei Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03826v1.pdf","comment":"This work is accepted by ACM MM2023. More modifications may be\n performed for further improvements"},{"id":"http://arxiv.org/abs/2308.03821v1","updated":"2023-08-07T15:30:02Z","published":"2023-08-07T15:30:02Z","title":"Distributionally Robust Classification on a Data Budget","summary":" Real world uses of deep learning require predictable model behavior under\ndistribution shifts. Models such as CLIP show emergent natural distributional\nrobustness comparable to humans, but may require hundreds of millions of\ntraining samples. Can we train robust learners in a domain where data is\nlimited? To rigorously address this question, we introduce JANuS (Joint\nAnnotations and Names Set), a collection of four new training datasets with\nimages, labels, and corresponding captions, and perform a series of carefully\ncontrolled investigations of factors contributing to robustness in image\nclassification, then compare those results to findings derived from a\nlarge-scale meta-analysis. Using this approach, we show that standard ResNet-50\ntrained with the cross-entropy loss on 2.4 million image samples can attain\ncomparable robustness to a CLIP ResNet-50 trained on 400 million samples. To\nour knowledge, this is the first result showing (near) state-of-the-art\ndistributional robustness on limited data budgets. 
Our dataset is available at\n\\url{https://huggingface.co/datasets/penfever/JANuS_dataset}, and the code used\nto reproduce our experiments can be found at\n\\url{https://github.com/penfever/vlhub/}.\n","authors":["Benjamin Feuer","Ameya Joshi","Minh Pham","Chinmay Hegde"],"pdf_url":"https://arxiv.org/pdf/2308.03821v1.pdf","comment":"TMLR 2023; openreview link:\n https://openreview.net/forum?id=D5Z2E8CNsD"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2212.09597v6","updated":"2023-08-07T17:50:52Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v6.pdf","comment":"ACL 2023, fixed Equation 2"},{"id":"http://arxiv.org/abs/2308.03735v1","updated":"2023-08-07T17:34:58Z","published":"2023-08-07T17:34:58Z","title":"Randomized algorithms for precise measurement of differentially-private,\n personalized recommendations","summary":" Personalized recommendations form an important part of today's internet\necosystem, helping artists and creators to reach interested users, and helping\nusers to discover new and engaging content. However, many users today are\nskeptical of platforms that personalize recommendations, in part due to\nhistorically careless treatment of personal data and data privacy. Now,\nbusinesses that rely on personalized recommendations are entering a new\nparadigm, where many of their systems must be overhauled to be privacy-first.\nIn this article, we propose an algorithm for personalized recommendations that\nfacilitates both precise and differentially-private measurement. We consider\nadvertising as an example application, and conduct offline experiments to\nquantify how the proposed privacy-preserving algorithm affects key metrics\nrelated to user experience, advertiser value, and platform revenue compared to\nthe extremes of both (private) non-personalized and non-private, personalized\nimplementations.\n","authors":["Allegra Laro","Yanqing Chen","Hao He","Babak Aghazadeh"],"pdf_url":"https://arxiv.org/pdf/2308.03735v1.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2308.03734v1","updated":"2023-08-07T17:32:33Z","published":"2023-08-07T17:32:33Z","title":"Labeling without Seeing? Blind Annotation for Privacy-Preserving Entity\n Resolution","summary":" The entity resolution problem requires finding pairs across datasets that\nbelong to different owners but refer to the same entity in the real world. To\ntrain and evaluate solutions (either rule-based or machine-learning-based) to\nthe entity resolution problem, generating a ground truth dataset with entity\npairs or clusters is needed. 
However, such a data annotation process involves\nhumans as domain oracles to review the plaintext data for all candidate record\npairs from different parties, which inevitably infringes the privacy of data\nowners, especially in privacy-sensitive cases like medical records. To the best\nof our knowledge, there is no prior work on privacy-preserving ground truth\ndataset generation, especially in the domain of entity resolution. We propose a\nnovel blind annotation protocol based on homomorphic encryption that allows\ndomain oracles to collaboratively label ground truths without sharing data in\nplaintext with other parties. In addition, we design a domain-specific\neasy-to-use language that hides the sophisticated underlying homomorphic\nencryption layer. Rigorous proof of the privacy guarantee is provided and our\nempirical experiments via an annotation simulator indicate the feasibility of\nour privacy-preserving protocol (f-measure on average achieves more than 90\\%\ncompared with the real ground truths).\n","authors":["Yixiang Yao","Weizhao Jin","Srivatsan Ravi"],"pdf_url":"https://arxiv.org/pdf/2308.03734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03588v1","updated":"2023-08-07T13:45:48Z","published":"2023-08-07T13:45:48Z","title":"Multi-View Graph Convolutional Network for Multimedia Recommendation","summary":" Multimedia recommendation has received much attention in recent years. It\nmodels user preferences based on both behavior information and item multimodal\ninformation. Though current GCN-based methods achieve notable success, they\nsuffer from two limitations: (1) Modality noise contamination to the item\nrepresentations. Existing methods often mix modality features and behavior\nfeatures in a single view (e.g., user-item view) for propagation, the noise in\nthe modality features may be amplified and coupled with behavior features. In\nthe end, it leads to poor feature discriminability; (2) Incomplete user\npreference modeling caused by equal treatment of modality features. Users often\nexhibit distinct modality preferences when purchasing different items. Equally\nfusing each modality feature ignores the relative importance among different\nmodalities, leading to the suboptimal user preference modeling. To tackle the\nabove issues, we propose a novel Multi-View Graph Convolutional Network for the\nmultimedia recommendation. Specifically, to avoid modality noise contamination,\nthe modality features are first purified with the aid of item behavior\ninformation. Then, the purified modality features of items and behavior\nfeatures are enriched in separate views, including the user-item view and the\nitem-item view. In this way, the distinguishability of features is enhanced.\nMeanwhile, a behavior-aware fuser is designed to comprehensively model user\npreferences by adaptively learning the relative importance of different\nmodality features. Furthermore, we equip the fuser with a self-supervised\nauxiliary task. 
This task is expected to maximize the mutual information\nbetween the fused multimodal features and behavior features, so as to capture\ncomplementary and supplementary preference information simultaneously.\nExtensive experiments on three public datasets demonstrate the effectiveness of\nour methods.\n","authors":["Penghang Yu","Zhiyi Tan","Guanming Lu","Bing-Kun Bao"],"pdf_url":"https://arxiv.org/pdf/2308.03588v1.pdf","comment":"MM'23"},{"id":"http://arxiv.org/abs/2308.03578v1","updated":"2023-08-07T13:35:02Z","published":"2023-08-07T13:35:02Z","title":"TeraHAC: Hierarchical Agglomerative Clustering of Trillion-Edge Graphs","summary":" We introduce TeraHAC, a $(1+\\epsilon)$-approximate hierarchical agglomerative\nclustering (HAC) algorithm which scales to trillion-edge graphs. Our algorithm\nis based on a new approach to computing $(1+\\epsilon)$-approximate HAC, which\nis a novel combination of the nearest-neighbor chain algorithm and the notion\nof $(1+\\epsilon)$-approximate HAC. Our approach allows us to partition the\ngraph among multiple machines and make significant progress in computing the\nclustering within each partition before any communication with other partitions\nis needed.\n We evaluate TeraHAC on a number of real-world and synthetic graphs of up to 8\ntrillion edges. We show that TeraHAC requires over 100x fewer rounds compared\nto previously known approaches for computing HAC. It is up to 8.3x faster than\nSCC, the state-of-the-art distributed algorithm for hierarchical clustering,\nwhile achieving 1.16x higher quality. In fact, TeraHAC essentially retains the\nquality of the celebrated HAC algorithm while significantly improving the\nrunning time.\n","authors":["Laxman Dhulipala","Jason Lee","Jakub Łącki","Vahab Mirrokni"],"pdf_url":"https://arxiv.org/pdf/2308.03578v1.pdf","comment":"To appear at SIGMOD 2024"},{"id":"http://arxiv.org/abs/2308.03563v1","updated":"2023-08-07T13:15:33Z","published":"2023-08-07T13:15:33Z","title":"Global cognitive graph properties dynamics of hippocampal formation","summary":" In the present study we have used a set of methods and metrics to build a\ngraph of relative neural connections in a hippocampus of a rodent. A set of\ngraphs was built on top of time-sequenced data and analyzed in terms of\ndynamics of a connection genesis. The analysis has shown that during the\nprocess of a rodent exploring a novel environment, the relations between\nneurons constantly change which indicates that globally memory is constantly\nupdated even for known areas of space. Even if some neurons gain cognitive\nspecialization, the global network though remains relatively stable.\nAdditionally we suggest a set of methods for building a graph of cognitive\nneural network.\n","authors":["Konstantin Sorokin","Andrey Zaitsew","Aleksandr Levin","German Magai","Maxim Beketov","Vladimir Sotskov"],"pdf_url":"https://arxiv.org/pdf/2308.03563v1.pdf","comment":"12 pages, 6 figures, paper for DAMDID 2023 Conference"},{"id":"http://arxiv.org/abs/2308.03470v1","updated":"2023-08-07T10:56:57Z","published":"2023-08-07T10:56:57Z","title":"Uncertainty-aware Consistency Learning for Cold-Start Item\n Recommendation","summary":" Graph Neural Network (GNN)-based models have become the mainstream approach\nfor recommender systems. 
Despite the effectiveness, they are still suffering\nfrom the cold-start problem, i.e., recommend for few-interaction items.\nExisting GNN-based recommendation models to address the cold-start problem\nmainly focus on utilizing auxiliary features of users and items, leaving the\nuser-item interactions under-utilized. However, embeddings distributions of\ncold and warm items are still largely different, since cold items' embeddings\nare learned from lower-popularity interactions, while warm items' embeddings\nare from higher-popularity interactions. Thus, there is a seesaw phenomenon,\nwhere the recommendation performance for the cold and warm items cannot be\nimproved simultaneously. To this end, we proposed a Uncertainty-aware\nConsistency learning framework for Cold-start item recommendation (shorten as\nUCC) solely based on user-item interactions. Under this framework, we train the\nteacher model (generator) and student model (recommender) with consistency\nlearning, to ensure the cold items with additionally generated low-uncertainty\ninteractions can have similar distribution with the warm items. Therefore, the\nproposed framework improves the recommendation of cold and warm items at the\nsame time, without hurting any one of them. Extensive experiments on benchmark\ndatasets demonstrate that our proposed method significantly outperforms\nstate-of-the-art methods on both warm and cold items, with an average\nperformance improvement of 27.6%.\n","authors":["Taichi Liu","Chen Gao","Zhenyu Wang","Dong Li","Jianye Hao","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.03470v1.pdf","comment":"Accepted by SIGIR 2023"},{"id":"http://arxiv.org/abs/2308.03443v1","updated":"2023-08-07T10:00:07Z","published":"2023-08-07T10:00:07Z","title":"Doubly Robust Estimator for Off-Policy Evaluation with Large Action\n Spaces","summary":" We study Off-Policy Evaluation (OPE) in contextual bandit settings with large\naction spaces. The benchmark estimators suffer from severe bias and variance\ntradeoffs. Parametric approaches suffer from bias due to difficulty specifying\nthe correct model, whereas ones with importance weight suffer from variance. To\novercome these limitations, Marginalized Inverse Propensity Scoring (MIPS) was\nproposed to mitigate the estimator's variance via embeddings of an action. To\nmake the estimator more accurate, we propose the doubly robust estimator of\nMIPS called the Marginalized Doubly Robust (MDR) estimator. Theoretical\nanalysis shows that the proposed estimator is unbiased under weaker assumptions\nthan MIPS while maintaining variance reduction against IPS, which was the main\nadvantage of MIPS. The empirical experiment verifies the supremacy of MDR\nagainst existing estimators.\n","authors":["Tatsuhiro Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.03443v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.03400v1","updated":"2023-08-07T08:38:15Z","published":"2023-08-07T08:38:15Z","title":"Hierarchical Contrastive Learning with Multiple Augmentation for\n Sequential Recommendation","summary":" Sequential recommendation addresses the issue of preference drift by\npredicting the next item based on the user's previous behaviors. 
Recently, a\npromising approach using contrastive learning has emerged, demonstrating its\neffectiveness in recommending items under sparse user-item interactions.\nSignificantly, the effectiveness of combinations of various augmentation\nmethods has been demonstrated in different domains, particularly in computer\nvision. However, when it comes to augmentation within a contrastive learning\nframework in sequential recommendation, previous research has only focused on\nlimited conditions and simple structures. Thus, it is still possible to extend\nexisting approaches to boost the effects of augmentation methods by using\nprogressed structures with the combinations of multiple augmentation methods.\nIn this work, we propose a novel framework called Hierarchical Contrastive\nLearning with Multiple Augmentation for Sequential Recommendation(HCLRec) to\novercome the aforementioned limitation. Our framework leverages existing\naugmentation methods hierarchically to improve performance. By combining\naugmentation methods continuously, we generate low-level and high-level view\npairs. We employ a Transformers-based model to encode the input sequence\neffectively. Furthermore, we introduce additional blocks consisting of\nTransformers and position-wise feed-forward network(PFFN) layers to learn the\ninvariance of the original sequences from hierarchically augmented views. We\npass the input sequence to subsequent layers based on the number of increment\nlevels applied to the views to handle various augmentation levels. Within each\nlayer, we compute contrastive loss between pairs of views at the same level.\nExtensive experiments demonstrate that our proposed method outperforms\nstate-of-the-art approaches and that HCLRec is robust even when faced with the\nproblem of sparse interaction.\n","authors":["Dongjun Lee","Donggeun Ko","Jaekwang Kim"],"pdf_url":"https://arxiv.org/pdf/2308.03400v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.03366v1","updated":"2023-08-07T07:41:01Z","published":"2023-08-07T07:41:01Z","title":"POSIT: Promotion of Semantic Item Tail via Adversarial Learning","summary":" In many recommender problems, a handful of popular items (e.g. movies/TV\nshows, news etc.) can be dominant in recommendations for many users. However,\nwe know that in a large catalog of items, users are likely interested in more\nthan what is popular. The dominance of popular items may mean that users will\nnot see items they would likely enjoy. In this paper, we propose a technique to\novercome this problem using adversarial machine learning. We define a metric to\ntranslate user-level utility metric in terms of an advantage/disadvantage over\nitems. We subsequently use that metric in an adversarial learning framework to\nsystematically promote disadvantaged items. The resulting algorithm identifies\nsemantically meaningful items that get promoted in the learning algorithm. In\nthe empirical study, we evaluate the proposed technique on three publicly\navailable datasets and four competitive baselines. 
The result shows that our\nproposed method not only improves the coverage, but also, surprisingly,\nimproves the overall performance.\n","authors":["Qiuling Xu","Pannaga Shivaswamy","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03366v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.03333v1","updated":"2023-08-07T06:29:20Z","published":"2023-08-07T06:29:20Z","title":"Heterogeneous Knowledge Fusion: A Novel Approach for Personalized\n Recommendation via LLM","summary":" The analysis and mining of user heterogeneous behavior are of paramount\nimportance in recommendation systems. However, the conventional approach of\nincorporating various types of heterogeneous behavior into recommendation\nmodels leads to feature sparsity and knowledge fragmentation issues. To address\nthis challenge, we propose a novel approach for personalized recommendation via\nLarge Language Model (LLM), by extracting and fusing heterogeneous knowledge\nfrom user heterogeneous behavior information. In addition, by combining\nheterogeneous knowledge and recommendation tasks, instruction tuning is\nperformed on LLM for personalized recommendations. The experimental results\ndemonstrate that our method can effectively integrate user heterogeneous\nbehavior and significantly improve recommendation performance.\n","authors":["Bin Yin","Junjie Xie","Yu Qin","Zixiang Ding","Zhichao Feng","Xiang Li","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2308.03333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03869v1","updated":"2023-08-07T18:40:13Z","published":"2023-08-07T18:40:13Z","title":"Semantic Equivalence of e-Commerce Queries","summary":" Search query variation poses a challenge in e-commerce search, as equivalent\nsearch intents can be expressed through different queries with surface-level\ndifferences. This paper introduces a framework to recognize and leverage query\nequivalence to enhance searcher and business outcomes. The proposed approach\naddresses three key problems: mapping queries to vector representations of\nsearch intent, identifying nearest neighbor queries expressing equivalent or\nsimilar intent, and optimizing for user or business objectives. The framework\nutilizes both surface similarity and behavioral similarity to determine query\nequivalence. Surface similarity involves canonicalizing queries based on word\ninflection, word order, compounding, and noise words. Behavioral similarity\nleverages historical search behavior to generate vector representations of\nquery intent. An offline process is used to train a sentence similarity model,\nwhile an online nearest neighbor approach supports processing of unseen\nqueries. Experimental evaluations demonstrate the effectiveness of the proposed\napproach, outperforming popular sentence transformer models and achieving a\nPearson correlation of 0.85 for query similarity. The results highlight the\npotential of leveraging historical behavior data and training models to\nrecognize and utilize query equivalence in e-commerce search, leading to\nimproved user experiences and business outcomes. 
Further advancements and\nbenchmark datasets are encouraged to facilitate the development of solutions\nfor this critical problem in the e-commerce domain.\n","authors":["Aritra Mandal","Daniel Tunkelang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03869v1.pdf","comment":"The 6th Workshop on e-Commerce and NLP"},{"id":"http://arxiv.org/abs/2308.03855v1","updated":"2023-08-07T18:06:46Z","published":"2023-08-07T18:06:46Z","title":"Mobile Supply: The Last Piece of Jigsaw of Recommender System","summary":" Recommendation system is a fundamental functionality of online platforms.\nWith the development of computing power of mobile phones, some researchers have\ndeployed recommendation algorithms on users' devices to solve the problems of\ndata transmission delay and pagination mechanism. However, the existing\nedge-side mobile rankings cannot completely solve the problem of pagination\nmechanism. The mobile rankings can only sort the items on the current page, so\nit will not work if it is called once or twice. Besides, after the user has\nviewed the items of interest to the user on the current page, the user refresh\nto get a new page of items. This will make the mobile ranking model do a lot of\nuseless work and affect the user's immersive experience. In order to solve the\npagination mechanism problem, we propose a completely new module in the\npipeline of recommender named Mobile Supply. The pipeline of recommender system\nis extended to \"retrival->pre-ranking->ranking->re-ranking->Mobile\nSupply->mobile ranking\". Specifically, we introduce the concept of list value\nand use point-wise method to approximate list-wise estimation. We also design a\nnew mobile ranking named device-aware mobile ranking considering the difference\nof mobile devices tailored to the new pipeline. Extensive offline and online\nexperiments show the superiority of our proposed method and prove that Mobile\nSupply can further improve the performance of edge-side recommender system and\nuser experience. Mobile Supply has been deployed on the homepage page of a\nlarge-scale online food platform and has yielded considerable profits in our\nbusiness.\n","authors":["Zhenhao Jiang","Biao Zeng","Hao Feng","Jin Liu","Jie Zhang","Jia Jia","Ning Hu"],"pdf_url":"https://arxiv.org/pdf/2308.03855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03842v1","updated":"2023-08-07T18:00:04Z","published":"2023-08-07T18:00:04Z","title":"Search Engine and Recommendation System for the Music Industry built\n with JinaAI","summary":" One of the most intriguing debates regarding a novel task is the development\nof search engines and recommendation-based systems in the music industry.\nStudies have shown a drastic depression in the search engine fields, due to\nconcerning factors such as speed, accuracy and the format of data given for\nquerying. Often people face difficulty in searching for a song solely based on\nthe title, hence a solution is proposed to complete a search analysis through a\nsingle query input and is matched with the lyrics of the songs present in the\ndatabase. Hence it is essential to incorporate cutting-edge technology tools\nfor developing a user-friendly search engine. Jina AI is an MLOps framework for\nbuilding neural search engines that are utilized, in order for the user to\nobtain accurate results. Jina AI effectively helps to maintain and enhance the\nquality of performance for the search engine for the query given. 
An effective\nsearch engine and a recommendation system for the music industry, built with\nJinaAI.\n","authors":["Ishita Gopalakrishnan","Sanjjushri Varshini R","Ponshriharini V"],"pdf_url":"https://arxiv.org/pdf/2308.03842v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2302.07181v2","updated":"2023-08-07T17:59:16Z","published":"2023-02-14T16:49:25Z","title":"Quantum algorithms applied to satellite mission planning for Earth\n observation","summary":" Earth imaging satellites are a crucial part of our everyday lives that enable\nglobal tracking of industrial activities. Use cases span many applications,\nfrom weather forecasting to digital maps, carbon footprint tracking, and\nvegetation monitoring. However, there are limitations; satellites are difficult\nto manufacture, expensive to maintain, and tricky to launch into orbit.\nTherefore, satellites must be employed efficiently. This poses a challenge\nknown as the satellite mission planning problem, which could be computationally\nprohibitive to solve on large scales. However, close-to-optimal algorithms,\nsuch as greedy reinforcement learning and optimization algorithms, can often\nprovide satisfactory resolutions. This paper introduces a set of quantum\nalgorithms to solve the mission planning problem and demonstrate an advantage\nover the classical algorithms implemented thus far. The problem is formulated\nas maximizing the number of high-priority tasks completed on real datasets\ncontaining thousands of tasks and multiple satellites. This work demonstrates\nthat through solution-chaining and clustering, optimization and machine\nlearning algorithms offer the greatest potential for optimal solutions. This\npaper notably illustrates that a hybridized quantum-enhanced reinforcement\nlearning agent can achieve a completion percentage of 98.5% over high-priority\ntasks, significantly improving over the baseline greedy methods with a\ncompletion rate of 75.8%. The results presented in this work pave the way to\nquantum-enabled solutions in the space industry and, more generally, future\nmission planning problems across industries.\n","authors":["Serge Rainjonneau","Igor Tokarev","Sergei Iudin","Saaketh Rayaprolu","Karan Pinto","Daria Lemtiuzhnikova","Miras Koblan","Egor Barashov","Mo Kordzanganeh","Markus Pflitsch","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2302.07181v2.pdf","comment":"13 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2211.09027v3","updated":"2023-08-07T17:56:54Z","published":"2022-11-12T10:12:17Z","title":"LLEDA -- Lifelong Self-Supervised Domain Adaptation","summary":" Humans and animals have the ability to continuously learn new information\nover their lifetime without losing previously acquired knowledge. However,\nartificial neural networks struggle with this due to new information\nconflicting with old knowledge, resulting in catastrophic forgetting. The\ncomplementary learning systems (CLS) theory suggests that the interplay between\nhippocampus and neocortex systems enables long-term and efficient learning in\nthe mammalian brain, with memory replay facilitating the interaction between\nthese two systems to reduce forgetting. 
The proposed Lifelong Self-Supervised\nDomain Adaptation (LLEDA) framework draws inspiration from the CLS theory and\nmimics the interaction between two networks: a DA network inspired by the\nhippocampus that quickly adjusts to changes in data distribution and an SSL\nnetwork inspired by the neocortex that gradually learns domain-agnostic general\nrepresentations. LLEDA's latent replay technique facilitates communication\nbetween these two networks by reactivating and replaying the past memory latent\nrepresentations to stabilise long-term generalisation and retention without\ninterfering with the previously learned information. Extensive experiments\ndemonstrate that the proposed method outperforms several other methods\nresulting in a long-term adaptation while being less prone to catastrophic\nforgetting when transferred to new domains.\n","authors":["Mamatha Thota","Dewei Yi","Georgios Leontidis"],"pdf_url":"https://arxiv.org/pdf/2211.09027v3.pdf","comment":"19 pages, 6 figures, 6 tables; V2 added more experiments on more\n domains and fixed typos"},{"id":"http://arxiv.org/abs/2308.01390v2","updated":"2023-08-07T17:53:09Z","published":"2023-08-02T19:10:23Z","title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive\n Vision-Language Models","summary":" We introduce OpenFlamingo, a family of autoregressive vision-language models\nranging from 3B to 9B parameters. OpenFlamingo is an ongoing effort to produce\nan open-source replication of DeepMind's Flamingo models. On seven\nvision-language datasets, OpenFlamingo models average between 80 - 89% of\ncorresponding Flamingo performance. This technical report describes our models,\ntraining data, hyperparameters, and evaluation suite. We share our models and\ncode at https://github.com/mlfoundations/open_flamingo.\n","authors":["Anas Awadalla","Irena Gao","Josh Gardner","Jack Hessel","Yusuf Hanafy","Wanrong Zhu","Kalyani Marathe","Yonatan Bitton","Samir Gadre","Shiori Sagawa","Jenia Jitsev","Simon Kornblith","Pang Wei Koh","Gabriel Ilharco","Mitchell Wortsman","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.01390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03743v1","updated":"2023-08-07T17:51:09Z","published":"2023-08-07T17:51:09Z","title":"The Copycat Perceptron: Smashing Barriers Through Collective Learning","summary":" We characterize the equilibrium properties of a model of $y$ coupled binary\nperceptrons in the teacher-student scenario, subject to a suitable learning\nrule, with an explicit ferromagnetic coupling proportional to the Hamming\ndistance between the students' weights. In contrast to recent works, we analyze\na more general setting in which a thermal noise is present that affects the\ngeneralization performance of each student. 
Specifically, in the presence of a\nnonzero temperature, which assigns nonzero probability to configurations that\nmisclassify samples with respect to the teacher's prescription, we find that\nthe coupling of replicas leads to a shift of the phase diagram to smaller\nvalues of $\\alpha$: This suggests that the free energy landscape gets smoother\naround the solution with good generalization (i.e., the teacher) at a fixed\nfraction of reviewed examples, which allows local update algorithms such as\nSimulated Annealing to reach the solution before the dynamics gets frozen.\nFinally, from a learning perspective, these results suggest that more students\n(in this case, with the same amount of data) are able to learn the same rule\nwhen coupled together with a smaller amount of data.\n","authors":["Giovanni Catania","Aurélien Decelle","Beatriz Seoane"],"pdf_url":"https://arxiv.org/pdf/2308.03743v1.pdf","comment":"4 figures"},{"id":"http://arxiv.org/abs/2212.09597v6","updated":"2023-08-07T17:50:52Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v6.pdf","comment":"ACL 2023, fixed Equation 2"},{"id":"http://arxiv.org/abs/2301.09656v3","updated":"2023-08-07T17:40:40Z","published":"2023-01-23T19:00:02Z","title":"Selective Explanations: Leveraging Human Input to Align Explainable AI","summary":" While a vast collection of explainable AI (XAI) algorithms have been\ndeveloped in recent years, they are often criticized for significant gaps with\nhow humans produce and consume explanations. As a result, current XAI\ntechniques are often found to be hard to use and lack effectiveness. In this\nwork, we attempt to close these gaps by making AI explanations selective -- a\nfundamental property of human explanations -- by selectively presenting a\nsubset from a large set of model reasons based on what aligns with the\nrecipient's preferences. We propose a general framework for generating\nselective explanations by leveraging human input on a small sample. This\nframework opens up a rich design space that accounts for different selectivity\ngoals, types of input, and more. As a showcase, we use a decision-support task\nto explore selective explanations based on what the decision-maker would\nconsider relevant to the decision task. We conducted two experimental studies\nto examine three out of a broader possible set of paradigms based on our\nproposed framework: in Study 1, we ask the participants to provide their own\ninput to generate selective explanations, with either open-ended or\ncritique-based input. In Study 2, we show participants selective explanations\nbased on input from a panel of similar users (annotators). 
Our experiments\ndemonstrate the promise of selective explanations in reducing over-reliance on\nAI and improving decision outcomes and subjective perceptions of the AI, but\nalso paint a nuanced picture that attributes some of these positive effects to\nthe opportunity to provide one's own input to augment AI explanations. Overall,\nour work proposes a novel XAI framework inspired by human communication\nbehaviors and demonstrates its potentials to encourage future work to better\nalign AI explanations with human production and consumption of explanations.\n","authors":["Vivian Lai","Yiming Zhang","Chacha Chen","Q. Vera Liao","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2301.09656v3.pdf","comment":"21 pages, 25 figures"},{"id":"http://arxiv.org/abs/2308.03735v1","updated":"2023-08-07T17:34:58Z","published":"2023-08-07T17:34:58Z","title":"Randomized algorithms for precise measurement of differentially-private,\n personalized recommendations","summary":" Personalized recommendations form an important part of today's internet\necosystem, helping artists and creators to reach interested users, and helping\nusers to discover new and engaging content. However, many users today are\nskeptical of platforms that personalize recommendations, in part due to\nhistorically careless treatment of personal data and data privacy. Now,\nbusinesses that rely on personalized recommendations are entering a new\nparadigm, where many of their systems must be overhauled to be privacy-first.\nIn this article, we propose an algorithm for personalized recommendations that\nfacilitates both precise and differentially-private measurement. We consider\nadvertising as an example application, and conduct offline experiments to\nquantify how the proposed privacy-preserving algorithm affects key metrics\nrelated to user experience, advertiser value, and platform revenue compared to\nthe extremes of both (private) non-personalized and non-private, personalized\nimplementations.\n","authors":["Allegra Laro","Yanqing Chen","Hao He","Babak Aghazadeh"],"pdf_url":"https://arxiv.org/pdf/2308.03735v1.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2308.03730v1","updated":"2023-08-07T17:18:37Z","published":"2023-08-07T17:18:37Z","title":"SurvBeX: An explanation method of the machine learning survival models\n based on the Beran estimator","summary":" An explanation method called SurvBeX is proposed to interpret predictions of\nthe machine learning survival black-box models. The main idea behind the method\nis to use the modified Beran estimator as the surrogate explanation model.\nCoefficients, incorporated into Beran estimator, can be regarded as values of\nthe feature impacts on the black-box model prediction. Following the well-known\nLIME method, many points are generated in a local area around an example of\ninterest. For every generated example, the survival function of the black-box\nmodel is computed, and the survival function of the surrogate model (the Beran\nestimator) is constructed as a function of the explanation coefficients. In\norder to find the explanation coefficients, it is proposed to minimize the mean\ndistance between the survival functions of the black-box model and the Beran\nestimator produced by the generated examples. Many numerical experiments with\nsynthetic and real survival data demonstrate the SurvBeX efficiency and compare\nthe method with the well-known method SurvLIME. The method is also compared\nwith the method SurvSHAP. 
The code implementing SurvBeX is available at:\nhttps://github.com/DanilaEremenko/SurvBeX\n","authors":["Lev V. Utkin","Danila Y. Eremenko","Andrei V. Konstantinov"],"pdf_url":"https://arxiv.org/pdf/2308.03730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08049v9","updated":"2023-08-07T17:09:10Z","published":"2022-12-15T18:55:23Z","title":"Sliced Optimal Partial Transport","summary":" Optimal transport (OT) has become exceedingly popular in machine learning,\ndata science, and computer vision. The core assumption in the OT problem is the\nequal total amount of mass in source and target measures, which limits its\napplication. Optimal Partial Transport (OPT) is a recently proposed solution to\nthis limitation. Similar to the OT problem, the computation of OPT relies on\nsolving a linear programming problem (often in high dimensions), which can\nbecome computationally prohibitive. In this paper, we propose an efficient\nalgorithm for calculating the OPT problem between two non-negative measures in\none dimension. Next, following the idea of sliced OT distances, we utilize\nslicing to define the sliced OPT distance. Finally, we demonstrate the\ncomputational and accuracy benefits of the sliced OPT-based method in various\nnumerical experiments. In particular, we show an application of our proposed\nSliced-OPT in noisy point cloud registration.\n","authors":["Yikun Bai","Berhnard Schmitzer","Mathew Thorpe","Soheil Kolouri"],"pdf_url":"https://arxiv.org/pdf/2212.08049v9.pdf","comment":"modify the link of Github page"},{"id":"http://arxiv.org/abs/2307.14361v2","updated":"2023-08-07T17:09:07Z","published":"2023-07-24T21:01:46Z","title":"A Hybrid Machine Learning Model for Classifying Gene Mutations in Cancer\n using LSTM, BiLSTM, CNN, GRU, and GloVe","summary":" This study presents an ensemble model combining LSTM, BiLSTM, CNN, GRU, and\nGloVe to classify gene mutations using Kaggle's Personalized Medicine:\nRedefining Cancer Treatment dataset. The results were compared against\nwell-known transformers like as BERT, Electra, Roberta, XLNet, Distilbert, and\ntheir LSTM ensembles. Our model outperformed all other models in terms of\naccuracy, precision, recall, F1 score, and Mean Squared Error. Surprisingly, it\nalso needed less training time, resulting in a perfect combination of\nperformance and efficiency. This study demonstrates the utility of ensemble\nmodels for difficult tasks such as gene mutation classification.\n","authors":["Sanad Aburass","Osama Dorgham","Jamil Al Shaqsi"],"pdf_url":"https://arxiv.org/pdf/2307.14361v2.pdf","comment":"6 pages, 7 figures and 2 tables"},{"id":"http://arxiv.org/abs/2308.01157v2","updated":"2023-08-07T17:06:56Z","published":"2023-08-02T13:59:35Z","title":"LLMs Understand Glass-Box Models, Discover Surprises, and Suggest\n Repairs","summary":" We show that large language models (LLMs) are remarkably good at working with\ninterpretable models that decompose complex outcomes into univariate\ngraph-represented components. By adopting a hierarchical approach to reasoning,\nLLMs can provide comprehensive model-level summaries without ever requiring the\nentire model to fit in context. This approach enables LLMs to apply their\nextensive background knowledge to automate common tasks in data science such as\ndetecting anomalies that contradict prior knowledge, describing potential\nreasons for the anomalies, and suggesting repairs that would remove the\nanomalies. 
We use multiple examples in healthcare to demonstrate the utility of\nthese new capabilities of LLMs, with particular emphasis on Generalized\nAdditive Models (GAMs). Finally, we present the package $\\texttt{TalkToEBM}$ as\nan open-source LLM-GAM interface.\n","authors":["Benjamin J. Lengerich","Sebastian Bordt","Harsha Nori","Mark E. Nunnally","Yin Aphinyanaphongs","Manolis Kellis","Rich Caruana"],"pdf_url":"https://arxiv.org/pdf/2308.01157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03723v1","updated":"2023-08-07T16:58:48Z","published":"2023-08-07T16:58:48Z","title":"Dimensionality Reduction for Improving Out-of-Distribution Detection in\n Medical Image Segmentation","summary":" Clinically deployed segmentation models are known to fail on data outside of\ntheir training distribution. As these models perform well on most cases, it is\nimperative to detect out-of-distribution (OOD) images at inference to protect\nagainst automation bias. This work applies the Mahalanobis distance post hoc to\nthe bottleneck features of a Swin UNETR model that segments the liver on\nT1-weighted magnetic resonance imaging. By reducing the dimensions of the\nbottleneck features with principal component analysis, OOD images were detected\nwith high performance and minimal computational load.\n","authors":["McKell Woodland","Nihil Patel","Mais Al Taie","Joshua P. Yung","Tucker J. Netherton","Ankit B. Patel","Kristy K. Brock"],"pdf_url":"https://arxiv.org/pdf/2308.03723v1.pdf","comment":"This preprint has not undergone peer review or any post-submission\n improvements or corrections. The Version of Record of this contribution will\n be published in the Proceedings of Uncertainty for Safe Utilization of\n Machine Learning in Medical Imaging (5th International Workshop) - Held in\n conjunction with MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.02029v2","updated":"2023-08-07T16:36:59Z","published":"2023-08-03T20:45:11Z","title":"Deep Maxout Network-based Feature Fusion and Political Tangent Search\n Optimizer enabled Transfer Learning for Thalassemia Detection","summary":" Thalassemia is a heritable blood disorder which is the outcome of a genetic\ndefect causing lack of production of hemoglobin polypeptide chains. However,\nthere is less understanding of the precise frequency as well as sharing in\nthese areas. Knowing about the frequency of thalassemia occurrence and\ndependable mutations is thus a significant step in preventing, controlling, and\ntreatment planning. Here, Political Tangent Search Optimizer based Transfer\nLearning (PTSO_TL) is introduced for thalassemia detection. Initially, input\ndata obtained from a particular dataset is normalized in the data normalization\nstage. Quantile normalization is utilized in the data normalization stage, and\nthe data are then passed to the feature fusion phase, in which Weighted\nEuclidean Distance with Deep Maxout Network (DMN) is utilized. Thereafter, data\naugmentation is performed using the oversampling method to increase data\ndimensionality. Lastly, thalassemia detection is carried out by TL, wherein a\nconvolutional neural network (CNN) is utilized with hyperparameters from a\ntrained model such as Xception. TL is tuned by PTSO, and the training algorithm\nPTSO is presented by merging of Political Optimizer (PO) and Tangent Search\nAlgorithm (TSA). 
Furthermore, PTSO_TL obtained maximal precision, recall, and\nf-measure values of about 94.3%, 96.1%, and 95.2%, respectively.\n","authors":["Hemn Barzan Abdalla","Awder Ahmed","Guoquan Li","Nasser Mustafa","Abdur Rashid Sangi"],"pdf_url":"https://arxiv.org/pdf/2308.02029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03713v1","updated":"2023-08-07T16:32:14Z","published":"2023-08-07T16:32:14Z","title":"Communication-Efficient Framework for Distributed Image Semantic\n Wireless Transmission","summary":" Multi-node communication, which refers to the interaction among multiple\ndevices, has attracted lots of attention in many Internet-of-Things (IoT)\nscenarios. However, its huge amounts of data flows and inflexibility for task\nextension have triggered the urgent requirement of communication-efficient\ndistributed data transmission frameworks. In this paper, inspired by the great\nsuperiorities on bandwidth reduction and task adaptation of semantic\ncommunications, we propose a federated learning-based semantic communication\n(FLSC) framework for multi-task distributed image transmission with IoT\ndevices. Federated learning enables the design of independent semantic\ncommunication link of each user while further improves the semantic extraction\nand task performance through global aggregation. Each link in FLSC is composed\nof a hierarchical vision transformer (HVT)-based extractor and a task-adaptive\ntranslator for coarse-to-fine semantic extraction and meaning translation\naccording to specific tasks. In order to extend the FLSC into more realistic\nconditions, we design a channel state information-based multiple-input\nmultiple-output transmission module to combat channel fading and noise.\nSimulation results show that the coarse semantic information can deal with a\nrange of image-level tasks. Moreover, especially in low signal-to-noise ratio\nand channel bandwidth ratio regimes, FLSC evidently outperforms the traditional\nscheme, e.g. about 10 peak signal-to-noise ratio gain in the 3 dB channel\ncondition.\n","authors":["Bingyan Xie","Yongpeng Wu","Yuxuan Shi","Derrick Wing Kwan Ng","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03713v1.pdf","comment":"This paper has been accepted by IEEE Internet of Things Journal"},{"id":"http://arxiv.org/abs/2308.03712v1","updated":"2023-08-07T16:31:38Z","published":"2023-08-07T16:31:38Z","title":"Scaling may be all you need for achieving human-level object recognition\n capacity with human-like visual experience","summary":" This paper asks whether current self-supervised learning methods, if\nsufficiently scaled up, would be able to reach human-level visual object\nrecognition capabilities with the same type and amount of visual experience\nhumans learn from. Previous work on this question only considered the scaling\nof data size. Here, we consider the simultaneous scaling of data size, model\nsize, and image resolution. We perform a scaling experiment with vision\ntransformers up to 633M parameters in size (ViT-H/14) trained with up to 5K\nhours of human-like video data (long, continuous, mostly egocentric videos)\nwith image resolutions of up to 476x476 pixels. The efficiency of masked\nautoencoders (MAEs) as a self-supervised learning algorithm makes it possible\nto run this scaling experiment on an unassuming academic budget. We find that\nit is feasible to reach human-level object recognition capacity at sub-human\nscales of model size, data size, and image size, if these factors are scaled up\nsimultaneously. 
To give a concrete example, we estimate that a 2.5B parameter\nViT model trained with 20K hours (2.3 years) of human-like video data with a\nspatial resolution of 952x952 pixels should be able to reach human-level\naccuracy on ImageNet. Human-level competence is thus achievable for a\nfundamental perceptual capability from human-like perceptual experience\n(human-like in both amount and type) with extremely generic learning algorithms\nand architectures and without any substantive inductive biases.\n","authors":["A. Emin Orhan"],"pdf_url":"https://arxiv.org/pdf/2308.03712v1.pdf","comment":"7 pages, 3 figures, 2 tables; code & models available from\n https://github.com/eminorhan/humanlike-vits"},{"id":"http://arxiv.org/abs/2308.03704v1","updated":"2023-08-07T16:22:59Z","published":"2023-08-07T16:22:59Z","title":"DeRisk: An Effective Deep Learning Framework for Credit Risk Prediction\n over Real-World Financial Data","summary":" Despite the tremendous advances achieved over the past years by deep learning\ntechniques, the latest risk prediction models for industrial applications still\nrely on highly hand-tuned, stage-wise statistical learning tools, such as\ngradient boosting and random forest methods. Different from images or\nlanguages, real-world financial data are high-dimensional, sparse, noisy and\nextremely imbalanced, which makes deep neural network models particularly\nchallenging to train and fragile in practice. In this work, we propose DeRisk,\nan effective deep learning risk prediction framework for credit risk prediction\non real-world financial data. DeRisk is the first deep risk prediction model\nthat outperforms statistical learning approaches deployed in our company's\nproduction system. We also perform extensive ablation studies on our method to\npresent the most critical factors for the empirical success of DeRisk.\n","authors":["Yancheng Liang","Jiajie Zhang","Hui Li","Xiaochen Liu","Yi Hu","Yong Wu","Jinyao Zhang","Yongyan Liu","Yi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07448v2","updated":"2023-08-07T16:10:43Z","published":"2023-05-12T13:05:32Z","title":"Deep Deterministic Policy Gradient for End-to-End Communication Systems\n without Prior Channel Knowledge","summary":" The End-to-End (E2E) learning-based concept has recently been introduced to\njointly optimize both the transmitter and the receiver in wireless\ncommunication systems. Unfortunately, this E2E learning architecture requires a\nprior differentiable channel model to jointly train the deep neural networks\n(DNNs) at the transceivers, which is hard to obtain in practice. This paper\naims to solve this issue by developing a deep deterministic policy gradient\n(DDPG)-based framework. In particular, the proposed solution uses the loss\nvalue of the receiver DNN as the reward to train the transmitter DNN. The\nsimulation results then show that our proposed solution can jointly train the\ntransmitter and the receiver without requiring the prior channel model. 
In\naddition, we demonstrate that the proposed DDPG-based solution can achieve\nbetter detection performance compared to the state-of-the-art solutions.\n","authors":["Bolun Zhang","Nguyen Van Huynh"],"pdf_url":"https://arxiv.org/pdf/2305.07448v2.pdf","comment":"submitted to IEEE GLOBECOM 2023"},{"id":"http://arxiv.org/abs/2308.03688v1","updated":"2023-08-07T16:08:11Z","published":"2023-08-07T16:08:11Z","title":"AgentBench: Evaluating LLMs as Agents","summary":" Large Language Models (LLMs) are becoming increasingly smart and autonomous,\ntargeting real-world pragmatic missions beyond traditional NLP tasks. As a\nresult, there has been an urgent need to evaluate LLMs as agents on challenging\ntasks in interactive environments. We present AgentBench, a multi-dimensional\nevolving benchmark that currently consists of 8 distinct environments to assess\nLLM-as-Agent's reasoning and decision-making abilities in a multi-turn\nopen-ended generation setting. Our extensive test over 25 LLMs (including APIs\nand open-sourced models) shows that, while top commercial LLMs present a strong\nability of acting as agents in complex environments, there is a significant\ndisparity in performance between them and open-sourced competitors. It also\nserves as a component of an ongoing project with wider coverage and deeper\nconsideration towards systematic LLM evaluation. Datasets, environments, and an\nintegrated evaluation package for AgentBench are released at\nhttps://github.com/THUDM/AgentBench\n","authors":["Xiao Liu","Hao Yu","Hanchen Zhang","Yifan Xu","Xuanyu Lei","Hanyu Lai","Yu Gu","Hangliang Ding","Kaiwen Men","Kejuan Yang","Shudan Zhang","Xiang Deng","Aohan Zeng","Zhengxiao Du","Chenhui Zhang","Sheng Shen","Tianjun Zhang","Yu Su","Huan Sun","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2308.03688v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2308.00086v2","updated":"2023-08-07T16:04:02Z","published":"2023-07-28T10:33:12Z","title":"Unsupervised machine-learning shock-capturing technique for high-order\n solvers","summary":" We present a novel unsupervised machine learning shock capturing algorithm\nbased on Gaussian Mixture Models (GMMs). The proposed GMM sensor demonstrates\nremarkable accuracy in detecting shocks and is robust across diverse test cases\nwithout the need for parameter tuning. We compare the GMM-based sensor with\nstate-of-the-art alternatives. All methods are integrated into a high-order\ncompressible discontinuous Galerkin solver where artificial viscosity can be\nmodulated to capture shocks. Supersonic test cases, including high Reynolds\nnumbers, showcase the sensor's performance, demonstrating the same\neffectiveness as fine-tuned state-of-the-art sensors. The nodal DG approach\nallows for potential applications in sub-cell flux-differencing formulations,\nsupersonic feature detection, and mesh refinement. The adaptive nature and\nability to function without extensive training datasets make this GMM-based\nsensor suitable for complex geometries and varied flow configurations. 
Our\nstudy reveals the potential of unsupervised machine learning methods,\nexemplified by the GMM sensor, to improve the robustness and efficiency of\nadvanced CFD codes.\n","authors":["Andrés Mateo-Gabín","Kenza Tlales","Eusebio Valero","Esteban Ferrer","Gonzalo Rubio"],"pdf_url":"https://arxiv.org/pdf/2308.00086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03687v1","updated":"2023-08-07T16:03:40Z","published":"2023-08-07T16:03:40Z","title":"Almost-sure convergence of iterates and multipliers in stochastic\n sequential quadratic optimization","summary":" Stochastic sequential quadratic optimization (SQP) methods for solving\ncontinuous optimization problems with nonlinear equality constraints have\nattracted attention recently, such as for solving large-scale data-fitting\nproblems subject to nonconvex constraints. However, for a recently proposed\nsubclass of such methods that is built on the popular stochastic-gradient\nmethodology from the unconstrained setting, convergence guarantees have been\nlimited to the asymptotic convergence of the expected value of a stationarity\nmeasure to zero. This is in contrast to the unconstrained setting in which\nalmost-sure convergence guarantees (of the gradient of the objective to zero)\ncan be proved for stochastic-gradient-based methods. In this paper, new\nalmost-sure convergence guarantees for the primal iterates, Lagrange\nmultipliers, and stationarity measures generated by a stochastic SQP algorithm\nin this subclass of methods are proved. It is shown that the error in the\nLagrange multipliers can be bounded by the distance of the primal iterate to a\nprimal stationary point plus the error in the latest stochastic gradient\nestimate. It is further shown that, subject to certain assumptions, this latter\nerror can be made to vanish by employing a running average of the Lagrange\nmultipliers that are computed during the run of the algorithm. The results of\nnumerical experiments are provided to demonstrate the proved theoretical\nguarantees.\n","authors":["Frank E. Curtis","Xin Jiang","Qi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03686v1","updated":"2023-08-07T16:01:14Z","published":"2023-08-07T16:01:14Z","title":"Linear Convergence Bounds for Diffusion Models via Stochastic\n Localization","summary":" Diffusion models are a powerful method for generating approximate samples\nfrom high-dimensional data distributions. Several recent results have provided\npolynomial bounds on the convergence rate of such models, assuming\n$L^2$-accurate score estimators. However, up until now the best known such\nbounds were either superlinear in the data dimension or required strong\nsmoothness assumptions. We provide the first convergence bounds which are\nlinear in the data dimension (up to logarithmic factors) assuming only finite\nsecond moments of the data distribution. We show that diffusion models require\nat most $\\tilde O(\\frac{d \\log^2(1/\\delta)}{\\varepsilon^2})$ steps to\napproximate an arbitrary data distribution on $\\mathbb{R}^d$ corrupted with\nGaussian noise of variance $\\delta$ to within $\\varepsilon^2$ in\nKullback--Leibler divergence. Our proof builds on the Girsanov-based methods of\nprevious works. 
We introduce a refined treatment of the error arising from the\ndiscretization of the reverse SDE, which is based on tools from stochastic\nlocalization.\n","authors":["Joe Benton","Valentin De Bortoli","Arnaud Doucet","George Deligiannidis"],"pdf_url":"https://arxiv.org/pdf/2308.03686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03670v1","updated":"2023-08-07T15:44:58Z","published":"2023-08-07T15:44:58Z","title":"Improving FHB Screening in Wheat Breeding Using an Efficient Transformer\n Model","summary":" Fusarium head blight is a devastating disease that causes significant\neconomic losses annually on small grains. Efficiency, accuracy, and timely\ndetection of FHB in the resistance screening are critical for wheat and barley\nbreeding programs. In recent years, various image processing techniques have\nbeen developed using supervised machine learning algorithms for the early\ndetection of FHB. The state-of-the-art convolutional neural network-based\nmethods, such as U-Net, employ a series of encoding blocks to create a local\nrepresentation and a series of decoding blocks to capture the semantic\nrelations. However, these methods are often not capable of modeling long-range\ndependencies inside the input data, and their ability to model multi-scale\nobjects with significant variations in texture and shape is limited. Vision\ntransformers, as alternative architectures with innate global self-attention\nmechanisms for sequence-to-sequence prediction, may also have limited\nlocalization capabilities due to insufficient low-level details. To overcome these\nlimitations, a new Context Bridge is proposed to integrate the local\nrepresentation capability of the U-Net network in the transformer model. In\naddition, the standard attention mechanism of the original transformer is\nreplaced with Efficient Self-attention, which is less complicated than other\nstate-of-the-art methods. To train the proposed network, 12,000 wheat images\nfrom an FHB-inoculated wheat field at the SDSU research farm in Volga, SD, were\ncaptured. In addition to healthy and unhealthy plants, these images encompass\nvarious stages of the disease. A team of expert pathologists annotated the\nimages for training and evaluating the developed model. As a result, the\neffectiveness of the transformer-based method for FHB-disease detection,\nthrough extensive experiments across typical tasks for plant image\nsegmentation, is demonstrated.\n","authors":["Babak Azad","Ahmed Abdalla","Kwanghee Won","Ali Mirzakhani Nafchi"],"pdf_url":"https://arxiv.org/pdf/2308.03670v1.pdf","comment":"10 pages, 5 figures, 1 table. Presented at the 2023 ASABE Annual\n International Meeting conference in Omaha, Nebraska. Also available at\n https://elibrary.asabe.org/abstract.asp?aid=54149"},{"id":"http://arxiv.org/abs/2308.03669v1","updated":"2023-08-07T15:40:34Z","published":"2023-08-07T15:40:34Z","title":"Diffusion Model in Causal Inference with Unmeasured Confounders","summary":" We study how to extend the use of the diffusion model to answer the causal\nquestion from the observational data under the existence of unmeasured\nconfounders. In Pearl's framework of using a Directed Acyclic Graph (DAG) to\ncapture the causal intervention, a Diffusion-based Causal Model (DCM) was\nproposed incorporating the diffusion model to answer the causal questions more\naccurately, assuming that all of the confounders are observed. However,\nunmeasured confounders in practice exist, which hinders DCM from being\napplicable. 
To alleviate this limitation of DCM, we propose an extended model\ncalled Backdoor Criterion based DCM (BDCM), whose idea is rooted in the\nBackdoor criterion to find the variables in DAG to be included in the decoding\nprocess of the diffusion model so that we can extend DCM to the case with\nunmeasured confounders. Synthetic data experiment demonstrates that our\nproposed model captures the counterfactual distribution more precisely than DCM\nunder the unmeasured confounders.\n","authors":["Tatsuhiro Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.03669v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2304.05365v6","updated":"2023-08-07T15:39:37Z","published":"2023-04-11T17:20:37Z","title":"Did we personalize? Assessing personalization by an online reinforcement\n learning algorithm using resampling","summary":" There is a growing interest in using reinforcement learning (RL) to\npersonalize sequences of treatments in digital health to support users in\nadopting healthier behaviors. Such sequential decision-making problems involve\ndecisions about when to treat and how to treat based on the user's context\n(e.g., prior activity level, location, etc.). Online RL is a promising\ndata-driven approach for this problem as it learns based on each user's\nhistorical responses and uses that knowledge to personalize these decisions.\nHowever, to decide whether the RL algorithm should be included in an\n``optimized'' intervention for real-world deployment, we must assess the data\nevidence indicating that the RL algorithm is actually personalizing the\ntreatments to its users. Due to the stochasticity in the RL algorithm, one may\nget a false impression that it is learning in certain states and using this\nlearning to provide specific treatments. We use a working definition of\npersonalization and introduce a resampling-based methodology for investigating\nwhether the personalization exhibited by the RL algorithm is an artifact of the\nRL algorithm stochasticity. We illustrate our methodology with a case study by\nanalyzing the data from a physical activity clinical trial called HeartSteps,\nwhich included the use of an online RL algorithm. We demonstrate how our\napproach enhances data-driven truth-in-advertising of algorithm personalization\nboth across all users as well as within specific users in the study.\n","authors":["Susobhan Ghosh","Raphael Kim","Prasidh Chhabria","Raaz Dwivedi","Predrag Klasnja","Peng Liao","Kelly Zhang","Susan Murphy"],"pdf_url":"https://arxiv.org/pdf/2304.05365v6.pdf","comment":"The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2308.03666v1","updated":"2023-08-07T15:35:32Z","published":"2023-08-07T15:35:32Z","title":"Bridging Trustworthiness and Open-World Learning: An Exploratory Neural\n Approach for Enhancing Interpretability, Generalization, and Robustness","summary":" As researchers strive to narrow the gap between machine intelligence and\nhuman through the development of artificial intelligence technologies, it is\nimperative that we recognize the critical importance of trustworthiness in\nopen-world, which has become ubiquitous in all aspects of daily life for\neveryone. However, several challenges may create a crisis of trust in current\nartificial intelligence systems that need to be bridged: 1) Insufficient\nexplanation of predictive results; 2) Inadequate generalization for learning\nmodels; 3) Poor adaptability to uncertain environments. 
Consequently, we\nexplore a neural program to bridge trustworthiness and open-world learning,\nextending from single-modal to multi-modal scenarios for readers. 1) To enhance\ndesign-level interpretability, we first customize trustworthy networks with\nspecific physical meanings; 2) We then design environmental well-being\ntask-interfaces via flexible learning regularizers for improving the\ngeneralization of trustworthy learning; 3) We propose to increase the\nrobustness of trustworthy learning by integrating open-world recognition losses\nwith agent mechanisms. Eventually, we enhance various trustworthy properties\nthrough the establishment of design-level explainability, environmental\nwell-being task-interfaces and open-world recognition programs. These designed\nopen-world protocols are applicable across a wide range of surroundings, under\nopen-world multimedia recognition scenarios with significant performance\nimprovements observed.\n","authors":["Shide Du","Zihan Fang","Shiyang Lan","Yanchao Tan","Manuel Günther","Shiping Wang","Wenzhong Guo"],"pdf_url":"https://arxiv.org/pdf/2308.03666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03664v1","updated":"2023-08-07T15:28:39Z","published":"2023-08-07T15:28:39Z","title":"Two-stage Early Prediction Framework of Remaining Useful Life for\n Lithium-ion Batteries","summary":" Early prediction of remaining useful life (RUL) is crucial for effective\nbattery management across various industries, ranging from household appliances\nto large-scale applications. Accurate RUL prediction improves the reliability\nand maintainability of battery technology. However, existing methods have\nlimitations, including assumptions of data from the same sensors or\ndistribution, foreknowledge of the end of life (EOL), and neglect to determine\nthe first prediction cycle (FPC) to identify the start of the unhealthy stage.\nThis paper proposes a novel method for RUL prediction of Lithium-ion batteries.\nThe proposed framework comprises two stages: determining the FPC using a neural\nnetwork-based model to divide the degradation data into distinct health states\nand predicting the degradation pattern after the FPC to estimate the remaining\nuseful life as a percentage. Experimental results demonstrate that the proposed\nmethod outperforms conventional approaches in terms of RUL prediction.\nFurthermore, the proposed method shows promise for real-world scenarios,\nproviding improved accuracy and applicability for battery management.\n","authors":["Dhruv Mittal","Hymalai Bello","Bo Zhou","Mayank Shekhar Jha","Sungho Suh","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2308.03664v1.pdf","comment":"Accepted at the 49th Annual Conference of the IEEE Industrial\n Electronics Society (IECON 2023)"},{"id":"http://arxiv.org/abs/2308.03661v1","updated":"2023-08-07T15:24:49Z","published":"2023-08-07T15:24:49Z","title":"Matrix Completion in Almost-Verification Time","summary":" We give a new framework for solving the fundamental problem of low-rank\nmatrix completion, i.e., approximating a rank-$r$ matrix $\\mathbf{M} \\in\n\\mathbb{R}^{m \\times n}$ (where $m \\ge n$) from random observations. First, we\nprovide an algorithm which completes $\\mathbf{M}$ on $99\\%$ of rows and columns\nunder no further assumptions on $\\mathbf{M}$ from $\\approx mr$ samples and\nusing $\\approx mr^2$ time. 
Then, assuming the row and column spans of\n$\\mathbf{M}$ satisfy additional regularity properties, we show how to boost\nthis partial completion guarantee to a full matrix completion algorithm by\naggregating solutions to regression problems involving the observations.\n In the well-studied setting where $\\mathbf{M}$ has incoherent row and column\nspans, our algorithms complete $\\mathbf{M}$ to high precision from\n$mr^{2+o(1)}$ observations in $mr^{3 + o(1)}$ time (omitting logarithmic\nfactors in problem parameters), improving upon the prior state-of-the-art\n[JN15] which used $\\approx mr^5$ samples and $\\approx mr^7$ time. Under an\nassumption on the row and column spans of $\\mathbf{M}$ we introduce (which is\nsatisfied by random subspaces with high probability), our sample complexity\nimproves to an almost information-theoretically optimal $mr^{1 + o(1)}$, and\nour runtime improves to $mr^{2 + o(1)}$. Our runtimes have the appealing\nproperty of matching the best known runtime to verify that a rank-$r$\ndecomposition $\\mathbf{U}\\mathbf{V}^\\top$ agrees with the sampled observations.\nWe also provide robust variants of our algorithms that, given random\nobservations from $\\mathbf{M} + \\mathbf{N}$ with $\\|\\mathbf{N}\\|_{F} \\le\n\\Delta$, complete $\\mathbf{M}$ to Frobenius norm distance $\\approx\nr^{1.5}\\Delta$ in the same runtimes as the noiseless setting. Prior noisy\nmatrix completion algorithms [CP10] only guaranteed a distance of $\\approx\n\\sqrt{n}\\Delta$.\n","authors":["Jonathan A. Kelner","Jerry Li","Allen Liu","Aaron Sidford","Kevin Tian"],"pdf_url":"https://arxiv.org/pdf/2308.03661v1.pdf","comment":"FOCS 2023"},{"id":"http://arxiv.org/abs/2308.03648v1","updated":"2023-08-07T14:58:53Z","published":"2023-08-07T14:58:53Z","title":"Generative Forests","summary":" Tabular data represents one of the most prevalent form of data. When it comes\nto data generation, many approaches would learn a density for the data\ngeneration process, but would not necessarily end up with a sampler, even less\nso being exact with respect to the underlying density. A second issue is on\nmodels: while complex modeling based on neural nets thrives in image or text\ngeneration (etc.), less is known for powerful generative models on tabular\ndata. A third problem is the visible chasm on tabular data between training\nalgorithms for supervised learning with remarkable properties (e.g. boosting),\nand a comparative lack of guarantees when it comes to data generation. In this\npaper, we tackle the three problems, introducing new tree-based generative\nmodels convenient for density modeling and tabular data generation that improve\non modeling capabilities of recent proposals, and a training algorithm which\nsimplifies the training setting of previous approaches and displays\nboosting-compliant convergence. 
This algorithm has the convenient property to\nrely on a supervised training scheme that can be implemented by a few tweaks to\nthe most popular induction scheme for decision tree induction with two classes.\nExperiments are provided on missing data imputation and comparing generated\ndata to real data, displaying the quality of the results obtained by our\napproach, in particular against state of the art.\n","authors":["Richard Nock","Mathieu Guillame-Bert"],"pdf_url":"https://arxiv.org/pdf/2308.03648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.05174v5","updated":"2023-08-07T14:37:00Z","published":"2022-08-10T06:36:49Z","title":"FedOBD: Opportunistic Block Dropout for Efficiently Training Large-scale\n Neural Networks through Federated Learning","summary":" Large-scale neural networks possess considerable expressive power. They are\nwell-suited for complex learning tasks in industrial applications. However,\nlarge-scale models pose significant challenges for training under the current\nFederated Learning (FL) paradigm. Existing approaches for efficient FL training\noften leverage model parameter dropout. However, manipulating individual model\nparameters is not only inefficient in meaningfully reducing the communication\noverhead when training large-scale FL models, but may also be detrimental to\nthe scaling efforts and model performance as shown by recent research. To\naddress these issues, we propose the Federated Opportunistic Block Dropout\n(FedOBD) approach. The key novelty is that it decomposes large-scale models\ninto semantic blocks so that FL participants can opportunistically upload\nquantized blocks, which are deemed to be significant towards training the\nmodel, to the FL server for aggregation. Extensive experiments evaluating\nFedOBD against four state-of-the-art approaches based on multiple real-world\ndatasets show that it reduces the overall communication overhead by more than\n88% compared to the best performing baseline approach, while achieving the\nhighest test accuracy. To the best of our knowledge, FedOBD is the first\napproach to perform dropout on FL models at the block level rather than at the\nindividual parameter level.\n","authors":["Yuanyuan Chen","Zichen Chen","Pengcheng Wu","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2208.05174v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03629v1","updated":"2023-08-07T14:36:03Z","published":"2023-08-07T14:36:03Z","title":"MedMine: Examining Pre-trained Language Models on Medication Mining","summary":" Automatic medication mining from clinical and biomedical text has become a\npopular topic due to its real impact on healthcare applications and the recent\ndevelopment of powerful language models (LMs). However, fully-automatic\nextraction models still face obstacles to be overcome such that they can be\ndeployed directly into clinical practice for better impacts. Such obstacles\ninclude their imbalanced performances on different entity types and clinical\nevents. In this work, we examine current state-of-the-art pre-trained language\nmodels (PLMs) on such tasks, via fine-tuning including the monolingual model\nMed7 and multilingual large language model (LLM) XLM-RoBERTa. We compare their\nadvantages and drawbacks using historical medication mining shared task data\nsets from n2c2-2018 challenges. 
We report the findings we get from these\nfine-tuning experiments such that they can facilitate future research on\naddressing them, for instance, how to combine their outputs, merge such models,\nor improve their overall accuracy by ensemble learning and data augmentation.\nMedMine is part of the M3 Initiative \\url{https://github.com/HECTA-UoM/M3}\n","authors":["Haifa Alrdahi","Lifeng Han","Hendrik Šuvalov","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.03629v1.pdf","comment":"Open Research Project. 7 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2303.12642v3","updated":"2023-08-07T14:29:03Z","published":"2023-03-22T15:23:22Z","title":"Democratising AI: Multiple Meanings, Goals, and Methods","summary":" Numerous parties are calling for the democratisation of AI, but the phrase is\nused to refer to a variety of goals, the pursuit of which sometimes conflict.\nThis paper identifies four kinds of AI democratisation that are commonly\ndiscussed: (1) the democratisation of AI use, (2) the democratisation of AI\ndevelopment, (3) the democratisation of AI profits, and (4) the democratisation\nof AI governance. Numerous goals and methods of achieving each form of\ndemocratisation are discussed. The main takeaway from this paper is that AI\ndemocratisation is a multifarious and sometimes conflicting concept that should\nnot be conflated with improving AI accessibility. If we want to move beyond\nambiguous commitments to democratising AI, to productive discussions of\nconcrete policies and trade-offs, then we need to recognise the principal role\nof the democratisation of AI governance in navigating tradeoffs and risks\nacross decisions around use, development, and profits.\n","authors":["Elizabeth Seger","Aviv Ovadya","Ben Garfinkel","Divya Siddarth","Allan Dafoe"],"pdf_url":"https://arxiv.org/pdf/2303.12642v3.pdf","comment":"V2 Changed second author affiliation; added citation to section 5.2;\n edit to author contribution statement; V3 camera ready version for conference\n proceedings. Minor content changes in response to reviewer comments"},{"id":"http://arxiv.org/abs/2308.03613v1","updated":"2023-08-07T14:16:52Z","published":"2023-08-07T14:16:52Z","title":"Adaptive Semi-Supervised Segmentation of Brain Vessels with Ambiguous\n Labels","summary":" Accurate segmentation of brain vessels is crucial for cerebrovascular disease\ndiagnosis and treatment. However, existing methods face challenges in capturing\nsmall vessels and handling datasets that are partially or ambiguously\nannotated. In this paper, we propose an adaptive semi-supervised approach to\naddress these challenges. Our approach incorporates innovative techniques\nincluding progressive semi-supervised learning, adaptative training strategy,\nand boundary enhancement. Experimental results on 3DRA datasets demonstrate the\nsuperiority of our method in terms of mesh-based segmentation metrics. 
By\nleveraging the partially and ambiguously labeled data, which only annotates the\nmain vessels, our method achieves impressive segmentation performance on\nmislabeled fine vessels, showcasing its potential for clinical applications.\n","authors":["Fengming Lin","Yan Xia","Nishant Ravikumar","Qiongyao Liu","Michael MacRaild","Alejandro F Frangi"],"pdf_url":"https://arxiv.org/pdf/2308.03613v1.pdf","comment":"Accepted by DALI MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.03574v1","updated":"2023-08-07T13:25:48Z","published":"2023-08-07T13:25:48Z","title":"Generalized Early Stopping in Evolutionary Direct Policy Search","summary":" Lengthy evaluation times are common in many optimization problems such as\ndirect policy search tasks, especially when they involve conducting evaluations\nin the physical world, e.g. in robotics applications. Often, when evaluating a\nsolution over a fixed time period, it becomes clear that the objective value\nwill not increase with additional computation time (for example, when a\ntwo-wheeled robot continuously spins on the spot). In such cases, it makes\nsense to stop the evaluation early to save computation time. However, most\napproaches to stop the evaluation are problem-specific and need to be\nspecifically designed for the task at hand. Therefore, we propose an early\nstopping method for direct policy search. The proposed method only looks at the\nobjective value at each time step and requires no problem-specific knowledge.\n We test the introduced stopping criterion in five direct policy search\nenvironments drawn from games, robotics, and classic control domains, and show\nthat it can save up to 75% of the computation time. We also compare it with\nproblem-specific stopping criteria and demonstrate that it performs comparably\nwhile being more generally applicable.\n","authors":["Etor Arza","Leni K. Le Goff","Emma Hart"],"pdf_url":"https://arxiv.org/pdf/2308.03574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03573v1","updated":"2023-08-07T13:24:52Z","published":"2023-08-07T13:24:52Z","title":"When Federated Learning meets Watermarking: A Comprehensive Overview of\n Techniques for Intellectual Property Protection","summary":" Federated Learning (FL) is a technique that allows multiple participants to\ncollaboratively train a Deep Neural Network (DNN) without the need of\ncentralizing their data. Among other advantages, it comes with\nprivacy-preserving properties making it attractive for application in sensitive\ncontexts, such as health care or the military. Although the data are not\nexplicitly exchanged, the training procedure requires sharing information about\nparticipants' models. This makes the individual models vulnerable to theft or\nunauthorized distribution by malicious actors. To address the issue of\nownership rights protection in the context of Machine Learning (ML), DNN\nWatermarking methods have been developed during the last five years. Most\nexisting works have focused on watermarking in a centralized manner, but only a\nfew methods have been designed for FL and its unique constraints. 
In this\npaper, we provide an overview of recent advancements in Federated Learning\nwatermarking, shedding light on the new challenges and opportunities that arise\nin this field.\n","authors":["Mohammed Lansari","Reda Bellafqira","Katarzyna Kapusta","Vincent Thouvenot","Olivier Bettan","Gouenou Coatrieux"],"pdf_url":"https://arxiv.org/pdf/2308.03573v1.pdf","comment":"2figures, 14pages, 3tables"},{"id":"http://arxiv.org/abs/2308.03572v1","updated":"2023-08-07T13:24:50Z","published":"2023-08-07T13:24:50Z","title":"Provably Efficient Learning in Partially Observable Contextual Bandit","summary":" In this paper, we investigate transfer learning in partially observable\ncontextual bandits, where agents have limited knowledge from other agents and\npartial information about hidden confounders. We first convert the problem to\nidentifying or partially identifying causal effects between actions and rewards\nthrough optimization problems. To solve these optimization problems, we\ndiscretize the original functional constraints of unknown distributions into\nlinear constraints, and sample compatible causal models via sequentially\nsolving linear programmings to obtain causal bounds with the consideration of\nestimation error. Our sampling algorithms provide desirable convergence results\nfor suitable sampling distributions. We then show how causal bounds can be\napplied to improving classical bandit algorithms and affect the regrets with\nrespect to the size of action sets and function spaces. Notably, in the task\nwith function approximation which allows us to handle general context\ndistributions, our method improves the order dependence on function space size\ncompared with previous literatures. We formally prove that our causally\nenhanced algorithms outperform classical bandit algorithms and achieve orders\nof magnitude faster convergence rates. Finally, we perform simulations that\ndemonstrate the efficiency of our strategy compared to the current\nstate-of-the-art methods. This research has the potential to enhance the\nperformance of contextual bandit agents in real-world applications where data\nis scarce and costly to obtain.\n","authors":["Xueping Gong","Jiheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03572v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2010.03104 by other authors"},{"id":"http://arxiv.org/abs/2206.08083v4","updated":"2023-08-07T13:24:06Z","published":"2022-06-16T10:53:18Z","title":"CARLANE: A Lane Detection Benchmark for Unsupervised Domain Adaptation\n from Simulation to multiple Real-World Domains","summary":" Unsupervised Domain Adaptation demonstrates great potential to mitigate\ndomain shifts by transferring models from labeled source domains to unlabeled\ntarget domains. While Unsupervised Domain Adaptation has been applied to a wide\nvariety of complex vision tasks, only few works focus on lane detection for\nautonomous driving. This can be attributed to the lack of publicly available\ndatasets. To facilitate research in these directions, we propose CARLANE, a\n3-way sim-to-real domain adaptation benchmark for 2D lane detection. CARLANE\nencompasses the single-target datasets MoLane and TuLane and the multi-target\ndataset MuLane. These datasets are built from three different domains, which\ncover diverse scenes and contain a total of 163K unique images, 118K of which\nare annotated. In addition we evaluate and report systematic baselines,\nincluding our own method, which builds upon Prototypical Cross-domain\nSelf-supervised Learning. 
We find that false positive and false negative rates\nof the evaluated domain adaptation methods are high compared to those of fully\nsupervised baselines. This affirms the need for benchmarks such as CARLANE to\nfurther strengthen research in Unsupervised Domain Adaptation for lane\ndetection. CARLANE, all evaluated models and the corresponding implementations\nare publicly available at https://carlanebenchmark.github.io.\n","authors":["Julian Gebele","Bonifaz Stuhr","Johann Haselberger"],"pdf_url":"https://arxiv.org/pdf/2206.08083v4.pdf","comment":"36th Conference on Neural Information Processing Systems (NeurIPS\n 2022) Track on Datasets and Benchmarks, 22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.12375v2","updated":"2023-08-07T13:22:01Z","published":"2023-07-23T16:54:41Z","title":"In-Context Learning in Large Language Models Learns Label Relationships\n but Is Not Conventional Learning","summary":" The performance of Large Language Models (LLMs) on downstream tasks often\nimproves significantly when including examples of the input-label relationship\nin the context. However, there is currently no consensus about how this\nin-context learning (ICL) ability of LLMs works: for example, while Xie et al.\n(2021) liken ICL to a general-purpose learning algorithm, Min et al. (2022b)\nargue ICL does not even learn label relationships from in-context examples. In\nthis paper, we study (1) how labels of in-context examples affect predictions,\n(2) how label relationships learned during pre-training interact with\ninput-label examples provided in-context, and (3) how ICL aggregates label\ninformation across in-context examples. Our findings suggest that LLMs usually\nincorporate information from in-context labels, but that pre-training and\nin-context label relationships are treated differently, and that the model does\nnot consider all in-context information equally. Our results give insights into\nunderstanding and aligning LLM behavior.\n","authors":["Jannik Kossen","Tom Rainforth","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2307.12375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03570v1","updated":"2023-08-07T13:21:58Z","published":"2023-08-07T13:21:58Z","title":"Partial identification of kernel based two sample tests with mismeasured\n data","summary":" Nonparametric two-sample tests such as the Maximum Mean Discrepancy (MMD) are\noften used to detect differences between two distributions in machine learning\napplications. However, the majority of existing literature assumes that\nerror-free samples from the two distributions of interest are available. We\nrelax this assumption and study the estimation of the MMD under\n$\\epsilon$-contamination, where a possibly non-random $\\epsilon$ proportion of\none distribution is erroneously grouped with the other. We show that under\n$\\epsilon$-contamination, the typical estimate of the MMD is unreliable.\nInstead, we study partial identification of the MMD, and characterize sharp\nupper and lower bounds that contain the true, unknown MMD. We propose a method\nto estimate these bounds, and show that it gives estimates that converge to the\nsharpest possible bounds on the MMD as sample size increases, with a\nconvergence rate that is faster than alternative approaches. 
Using three\ndatasets, we empirically validate that our approach is superior to the\nalternatives: it gives tight bounds with a low false coverage rate.\n","authors":["Ron Nafshi","Maggie Makar"],"pdf_url":"https://arxiv.org/pdf/2308.03570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10923v2","updated":"2023-08-07T12:58:57Z","published":"2022-06-22T09:02:42Z","title":"FairGrad: Fairness Aware Gradient Descent","summary":" We address the problem of group fairness in classification, where the\nobjective is to learn models that do not unjustly discriminate against\nsubgroups of the population. Most existing approaches are limited to simple\nbinary tasks or involve difficult to implement training mechanisms which\nreduces their practical applicability. In this paper, we propose FairGrad, a\nmethod to enforce fairness based on a re-weighting scheme that iteratively\nlearns group specific weights based on whether they are advantaged or not.\nFairGrad is easy to implement, accommodates various standard fairness\ndefinitions, and comes with minimal overhead. Furthermore, we show that it is\ncompetitive with standard baselines over various datasets including ones used\nin natural language processing and computer vision.\n FairGrad is available as a PyPI package at -\nhttps://pypi.org/project/fairgrad\n","authors":["Gaurav Maheshwari","Michaël Perrot"],"pdf_url":"https://arxiv.org/pdf/2206.10923v2.pdf","comment":"Paper is accepted at Transactions on Machine Learning Research.\n Reviewed on OpenReview: https://openreview.net/forum?id=0f8tU3QwWD"},{"id":"http://arxiv.org/abs/2308.03542v1","updated":"2023-08-07T12:44:10Z","published":"2023-08-07T12:44:10Z","title":"A Transfer Learning Framework for Proactive Ramp Metering Performance\n Assessment","summary":" Transportation agencies need to assess ramp metering performance when\ndeploying or expanding a ramp metering system. The evaluation of a ramp\nmetering strategy is primarily centered around examining its impact on freeway\ntraffic mobility. One way these effects can be explored is by comparing traffic\nstates, such as the speed before and after the ramp metering strategy has been\naltered. Predicting freeway traffic states for the after scenarios following\nthe implementation of a new ramp metering control strategy could offer valuable\ninsights into the potential effectiveness of the target strategy. However, the\nuse of machine learning methods in predicting the freeway traffic state for the\nafter scenarios and evaluating the effectiveness of transportation policies or\ntraffic control strategies such as ramp metering is somewhat limited in the\ncurrent literature. To bridge the research gap, this study presents a framework\nfor predicting freeway traffic parameters (speed, occupancy, and flow rate) for\nthe after situations when a new ramp metering control strategy is implemented.\nBy learning the association between the spatial-temporal features of traffic\nstates in before and after situations for known freeway segments, the proposed\nframework can transfer this learning to predict the traffic parameters for new\nfreeway segments. The proposed framework is built upon a transfer learning\nmodel. 
Experimental results show that the proposed framework is feasible for\nuse as an alternative for predicting freeway traffic parameters to proactively\nevaluate ramp metering performance.\n","authors":["Xiaobo Ma","Adrian Cottam","Mohammad Razaur Rahman Shaon","Yao-Jan Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03538v1","updated":"2023-08-07T12:36:30Z","published":"2023-08-07T12:36:30Z","title":"On-ramp and Off-ramp Traffic Flows Estimation Based on A Data-driven\n Transfer Learning Framework","summary":" To develop the most appropriate control strategy and monitor, maintain, and\nevaluate the traffic performance of the freeway weaving areas, state and local\nDepartments of Transportation need to have access to traffic flows at each pair\nof on-ramp and off-ramp. However, ramp flows are not always readily available\nto transportation agencies and little effort has been made to estimate these\nmissing flows in locations where no physical sensors are installed. To bridge\nthis research gap, a data-driven framework is proposed that can accurately\nestimate the missing ramp flows by solely using data collected from loop\ndetectors on freeway mainlines. The proposed framework employs a transfer\nlearning model. The transfer learning model relaxes the assumption that the\nunderlying data distributions of the source and target domains must be the\nsame. Therefore, the proposed framework can guarantee high-accuracy estimation\nof on-ramp and off-ramp flows on freeways with different traffic patterns,\ndistributions, and characteristics. Based on the experimental results, the flow\nestimation mean absolute errors range between 23.90 veh/h to 40.85 veh/h for\non-ramps, and 31.58 veh/h to 45.31 veh/h for off-ramps; the flow estimation\nroot mean square errors range between 34.55 veh/h to 57.77 veh/h for on-ramps,\nand 41.75 veh/h to 58.80 veh/h for off-ramps. Further, the comparison analysis\nshows that the proposed framework outperforms other conventional machine\nlearning models. The estimated ramp flows based on the proposed method can help\ntransportation agencies to enhance the operations of their ramp control\nstrategies for locations where physical sensors are not installed.\n","authors":["Xiaobo Ma","Abolfazl Karimpour","Yao-Jan Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.11577v4","updated":"2023-08-07T12:33:20Z","published":"2021-08-26T04:42:24Z","title":"Machine Unlearning of Features and Labels","summary":" Removing information from a machine learning model is a non-trivial task that\nrequires to partially revert the training process. This task is unavoidable\nwhen sensitive data, such as credit card numbers or passwords, accidentally\nenter the model and need to be removed afterwards. Recently, different concepts\nfor machine unlearning have been proposed to address this problem. While these\napproaches are effective in removing individual data points, they do not scale\nto scenarios where larger groups of features and labels need to be reverted. In\nthis paper, we propose the first method for unlearning features and labels. Our\napproach builds on the concept of influence functions and realizes unlearning\nthrough closed-form updates of model parameters. It enables to adapt the\ninfluence of training data on a learning model retrospectively, thereby\ncorrecting data leaks and privacy issues. 
For learning models with strongly\nconvex loss functions, our method provides certified unlearning with\ntheoretical guarantees. For models with non-convex losses, we empirically show\nthat unlearning features and labels is effective and significantly faster than\nother strategies.\n","authors":["Alexander Warnecke","Lukas Pirch","Christian Wressnegger","Konrad Rieck"],"pdf_url":"https://arxiv.org/pdf/2108.11577v4.pdf","comment":"Network and Distributed System Security Symposium (NDSS) 2023"},{"id":"http://arxiv.org/abs/2308.03530v1","updated":"2023-08-07T12:27:19Z","published":"2023-08-07T12:27:19Z","title":"Deep Feature Learning for Wireless Spectrum Data","summary":" In recent years, the traditional feature engineering process for training\nmachine learning models is being automated by the feature extraction layers\nintegrated in deep learning architectures. In wireless networks, many studies\nwere conducted in automatic learning of feature representations for\ndomain-related challenges. However, most of the existing works assume some\nsupervision along the learning process by using labels to optimize the model.\nIn this paper, we investigate an approach to learning feature representations\nfor wireless transmission clustering in a completely unsupervised manner, i.e.\nrequiring no labels in the process. We propose a model based on convolutional\nneural networks that automatically learns a reduced dimensionality\nrepresentation of the input data with 99.3% less components compared to a\nbaseline principal component analysis (PCA). We show that the automatic\nrepresentation learning is able to extract fine-grained clusters containing the\nshapes of the wireless transmission bursts, while the baseline enables only\ngeneral separability of the data based on the background noise.\n","authors":["Ljupcho Milosheski","Gregor Cerar","Blaž Bertalanič","Carolina Fortuna","Mihael Mohorčič"],"pdf_url":"https://arxiv.org/pdf/2308.03530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03526v1","updated":"2023-08-07T12:21:37Z","published":"2023-08-07T12:21:37Z","title":"AlphaStar Unplugged: Large-Scale Offline Reinforcement Learning","summary":" StarCraft II is one of the most challenging simulated reinforcement learning\nenvironments; it is partially observable, stochastic, multi-agent, and\nmastering StarCraft II requires strategic planning over long time horizons with\nreal-time low-level execution. It also has an active professional competitive\nscene. StarCraft II is uniquely suited for advancing offline RL algorithms,\nboth because of its challenging nature and because Blizzard has released a\nmassive dataset of millions of StarCraft II games played by human players. This\npaper leverages that and establishes a benchmark, called AlphaStar Unplugged,\nintroducing unprecedented challenges for offline reinforcement learning. We\ndefine a dataset (a subset of Blizzard's release), tools standardizing an API\nfor machine learning methods, and an evaluation protocol. We also present\nbaseline agents, including behavior cloning, offline variants of actor-critic\nand MuZero. 
We improve the state of the art of agents using only offline data,\nand we achieve 90% win rate against previously published AlphaStar behavior\ncloning agent.\n","authors":["Michaël Mathieu","Sherjil Ozair","Srivatsan Srinivasan","Caglar Gulcehre","Shangtong Zhang","Ray Jiang","Tom Le Paine","Richard Powell","Konrad Żołna","Julian Schrittwieser","David Choi","Petko Georgiev","Daniel Toyama","Aja Huang","Roman Ring","Igor Babuschkin","Timo Ewalds","Mahyar Bordbar","Sarah Henderson","Sergio Gómez Colmenarejo","Aäron van den Oord","Wojciech Marian Czarnecki","Nando de Freitas","Oriol Vinyals"],"pdf_url":"https://arxiv.org/pdf/2308.03526v1.pdf","comment":"32 pages, 13 figures, previous version published as a NeurIPS 2021\n workshop: https://openreview.net/forum?id=Np8Pumfoty"},{"id":"http://arxiv.org/abs/2308.03514v1","updated":"2023-08-07T12:10:13Z","published":"2023-08-07T12:10:13Z","title":"Worker Activity Recognition in Manufacturing Line Using Near-body\n Electric Field","summary":" Manufacturing industries strive to improve production efficiency and product\nquality by deploying advanced sensing and control systems. Wearable sensors are\nemerging as a promising solution for achieving this goal, as they can provide\ncontinuous and unobtrusive monitoring of workers' activities in the\nmanufacturing line. This paper presents a novel wearable sensing prototype that\ncombines IMU and body capacitance sensing modules to recognize worker\nactivities in the manufacturing line. To handle these multimodal sensor data,\nwe propose and compare early, and late sensor data fusion approaches for\nmulti-channel time-series convolutional neural networks and deep convolutional\nLSTM. We evaluate the proposed hardware and neural network model by collecting\nand annotating sensor data using the proposed sensing prototype and Apple\nWatches in the testbed of the manufacturing line. Experimental results\ndemonstrate that our proposed methods achieve superior performance compared to\nthe baseline methods, indicating the potential of the proposed approach for\nreal-world applications in manufacturing industries. Furthermore, the proposed\nsensing prototype with a body capacitive sensor and feature fusion method\nimproves by 6.35%, yielding a 9.38% higher macro F1 score than the proposed\nsensing prototype without a body capacitive sensor and Apple Watch data,\nrespectively.\n","authors":["Sungho Suh","Vitor Fortes Rey","Sizhen Bian","Yu-Chi Huang","Jože M. Rožanec","Hooman Tavakoli Ghinani","Bo Zhou","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2308.03514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08674v3","updated":"2023-08-07T12:08:17Z","published":"2023-07-17T17:36:09Z","title":"TableGPT: Towards Unifying Tables, Nature Language and Commands into One\n GPT","summary":" Tables are prevalent in real-world databases, requiring significant time and\neffort for humans to analyze and manipulate. The advancements in large language\nmodels (LLMs) have made it possible to interact with tables using natural\nlanguage input, bringing this capability closer to reality. In this paper, we\npresent TableGPT, a unified fine-tuned framework that enables LLMs to\nunderstand and operate on tables using external functional commands. 
It\nintroduces the capability to seamlessly interact with tables, enabling a wide\nrange of functionalities such as question answering, data manipulation (e.g.,\ninsert, delete, query, and modify operations), data visualization, analysis\nreport generation, and automated prediction. TableGPT aims to provide\nconvenience and accessibility to users by empowering them to effortlessly\nleverage tabular data. At the core of TableGPT lies the novel concept of global\ntabular representations, which empowers LLMs to gain a comprehensive\nunderstanding of the entire table beyond meta-information. By jointly training\nLLMs on both table and text modalities, TableGPT achieves a deep understanding\nof tabular data and the ability to perform complex operations on tables through\nchain-of-command instructions. Importantly, TableGPT offers the advantage of\nbeing a self-contained system rather than relying on external API interfaces.\nMoreover, it supports efficient data process flow, query rejection (when\nappropriate) and private deployment, enabling faster domain data fine-tuning\nand ensuring data privacy, which enhances the framework's adaptability to\nspecific use cases.\n","authors":["Liangyu Zha","Junlin Zhou","Liyao Li","Rui Wang","Qingyi Huang","Saisai Yang","Jing Yuan","Changbao Su","Xiang Li","Aofeng Su","Tao Zhang","Chen Zhou","Kaizhe Shou","Miao Wang","Wufang Zhu","Guoshan Lu","Chao Ye","Yali Ye","Wentao Ye","Yiming Zhang","Xinglong Deng","Jie Xu","Haobo Wang","Gang Chen","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.08674v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2302.00025v2","updated":"2023-08-07T12:06:43Z","published":"2023-01-31T19:00:28Z","title":"On the Within-Group Fairness of Screening Classifiers","summary":" Screening classifiers are increasingly used to identify qualified candidates\nin a variety of selection processes. In this context, it has been recently\nshown that, if a classifier is calibrated, one can identify the smallest set of\ncandidates which contains, in expectation, a desired number of qualified\ncandidates using a threshold decision rule. This lends support to focusing on\ncalibration as the only requirement for screening classifiers. In this paper,\nwe argue that screening policies that use calibrated classifiers may suffer\nfrom an understudied type of within-group unfairness -- they may unfairly treat\nqualified members within demographic groups of interest. Further, we argue that\nthis type of unfairness can be avoided if classifiers satisfy within-group\nmonotonicity, a natural monotonicity property within each of the groups. Then,\nwe introduce an efficient post-processing algorithm based on dynamic\nprogramming to minimally modify a given calibrated classifier so that its\nprobability estimates satisfy within-group monotonicity. We validate our\nalgorithm using US Census survey data and show that within-group monotonicity\ncan be often achieved at a small cost in terms of prediction granularity and\nshortlist size.\n","authors":["Nastaran Okati","Stratis Tsirtsis","Manuel Gomez Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2302.00025v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14311v3","updated":"2023-08-07T12:06:19Z","published":"2023-02-28T05:01:01Z","title":"Towards Memory- and Time-Efficient Backpropagation for Training Spiking\n Neural Networks","summary":" Spiking Neural Networks (SNNs) are promising energy-efficient models for\nneuromorphic computing. 
For training the non-differentiable SNN models, the\nbackpropagation through time (BPTT) with surrogate gradients (SG) method has\nachieved high performance. However, this method suffers from considerable\nmemory cost and training time during training. In this paper, we propose the\nSpatial Learning Through Time (SLTT) method that can achieve high performance\nwhile greatly improving training efficiency compared with BPTT. First, we show\nthat the backpropagation of SNNs through the temporal domain contributes just a\nlittle to the final calculated gradients. Thus, we propose to ignore the\nunimportant routes in the computational graph during backpropagation. The\nproposed method reduces the number of scalar multiplications and achieves a\nsmall memory occupation that is independent of the total time steps.\nFurthermore, we propose a variant of SLTT, called SLTT-K, that allows\nbackpropagation only at K time steps, then the required number of scalar\nmultiplications is further reduced and is independent of the total time steps.\nExperiments on both static and neuromorphic datasets demonstrate superior\ntraining efficiency and performance of our SLTT. In particular, our method\nachieves state-of-the-art accuracy on ImageNet, while the memory cost and\ntraining time are reduced by more than 70% and 50%, respectively, compared with\nBPTT.\n","authors":["Qingyan Meng","Mingqing Xiao","Shen Yan","Yisen Wang","Zhouchen Lin","Zhi-Quan Luo"],"pdf_url":"https://arxiv.org/pdf/2302.14311v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03511v1","updated":"2023-08-07T12:05:55Z","published":"2023-08-07T12:05:55Z","title":"A data-driven approach to predict decision point choice during normal\n and evacuation wayfinding in multi-story buildings","summary":" Understanding pedestrian route choice behavior in complex buildings is\nimportant to ensure pedestrian safety. Previous studies have mostly used\ntraditional data collection methods and discrete choice modeling to understand\nthe influence of different factors on pedestrian route and exit choice,\nparticularly in simple indoor environments. However, research on pedestrian\nroute choice in complex buildings is still limited. This paper presents a\ndata-driven approach for understanding and predicting the pedestrian decision\npoint choice during normal and emergency wayfinding in a multi-story building.\nFor this, we first built an indoor network representation and proposed a data\nmapping technique to map VR coordinates to the indoor representation. We then\nused a well-established machine learning algorithm, namely the random forest\n(RF) model to predict pedestrian decision point choice along a route during\nfour wayfinding tasks in a multi-story building. Pedestrian behavioral data in\na multi-story building was collected by a Virtual Reality experiment. The\nresults show a much higher prediction accuracy of decision points using the RF\nmodel (i.e., 93% on average) compared to the logistic regression model. The\nhighest prediction accuracy was 96% for task 3. Additionally, we tested the\nmodel performance combining personal characteristics and we found that personal\ncharacteristics did not affect decision point choice. 
This paper demonstrates\nthe potential of applying a machine learning algorithm to study pedestrian\nroute choice behavior in complex indoor buildings.\n","authors":["Yan Feng","Panchamy Krishnakumari"],"pdf_url":"https://arxiv.org/pdf/2308.03511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.12377v4","updated":"2023-08-07T12:02:40Z","published":"2022-07-25T17:46:09Z","title":"A novel Deep Learning approach for one-step Conformal Prediction\n approximation","summary":" Deep Learning predictions with measurable confidence are increasingly\ndesirable for real-world problems, especially in high-risk settings. The\nConformal Prediction (CP) framework is a versatile solution that guarantees a\nmaximum error rate given minimal constraints. In this paper, we propose a novel\nconformal loss function that approximates the traditionally two-step CP\napproach in a single step. By evaluating and penalising deviations from the\nstringent expected CP output distribution, a Deep Learning model may learn the\ndirect relationship between the input data and the conformal p-values. We carry\nout a comprehensive empirical evaluation to show our novel loss function's\ncompetitiveness for seven binary and multi-class prediction tasks on five\nbenchmark datasets. On the same datasets, our approach achieves significant\ntraining time reductions up to 86% compared to Aggregated Conformal Prediction\n(ACP), while maintaining comparable approximate validity and predictive\nefficiency.\n","authors":["Julia A. Meister","Khuong An Nguyen","Stelios Kapetanakis","Zhiyuan Luo"],"pdf_url":"https://arxiv.org/pdf/2207.12377v4.pdf","comment":"34 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.03495v1","updated":"2023-08-07T11:42:50Z","published":"2023-08-07T11:42:50Z","title":"Balanced Face Dataset: Guiding StyleGAN to Generate Labeled Synthetic\n Face Image Dataset for Underrepresented Group","summary":" For a machine learning model to generalize effectively to unseen data within\na particular problem domain, it is well-understood that the data needs to be of\nsufficient size and representative of real-world scenarios. Nonetheless,\nreal-world datasets frequently have overrepresented and underrepresented\ngroups. One solution to mitigate bias in machine learning is to leverage a\ndiverse and representative dataset. Training a model on a dataset that covers\nall demographics is crucial to reducing bias in machine learning. However,\ncollecting and labeling large-scale datasets has been challenging, prompting\nthe use of synthetic data generation and active labeling to decrease the costs\nof manual labeling. The focus of this study was to generate a robust face image\ndataset using the StyleGAN model. In order to achieve a balanced distribution\nof the dataset among different demographic groups, a synthetic dataset was\ncreated by controlling the generation process of StyleGAN and annotated for\ndifferent downstream tasks.\n","authors":["Kidist Amde Mekonnen"],"pdf_url":"https://arxiv.org/pdf/2308.03495v1.pdf","comment":"7 pages, 7 figures, submitted to AMLD Africa 2021 conference"},{"id":"http://arxiv.org/abs/2208.00953v2","updated":"2023-08-07T11:18:47Z","published":"2022-08-01T16:05:14Z","title":"Visual Interpretable and Explainable Deep Learning Models for Brain\n Tumor MRI and COVID-19 Chest X-ray Images","summary":" Deep learning shows promise for medical image analysis but lacks\ninterpretability, hindering adoption in healthcare. 
Attribution techniques that\nexplain model reasoning may increase trust in deep learning among clinical\nstakeholders. This paper aimed to evaluate attribution methods for illuminating\nhow deep neural networks analyze medical images. Using adaptive path-based\ngradient integration, we attributed predictions from brain tumor MRI and\nCOVID-19 chest X-ray datasets made by recent deep convolutional neural network\nmodels. The technique highlighted possible biomarkers, exposed model biases,\nand offered insights into the links between input and prediction. Our analysis\ndemonstrates the method's ability to elucidate model reasoning on these\ndatasets. The resulting attributions show promise for improving deep learning\ntransparency for domain experts by revealing the rationale behind predictions.\nThis study advances model interpretability to increase trust in deep learning\namong healthcare stakeholders.\n","authors":["Yusuf Brima","Marcellin Atemkeng"],"pdf_url":"https://arxiv.org/pdf/2208.00953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03345v3","updated":"2023-08-07T11:16:08Z","published":"2023-01-09T13:56:59Z","title":"Latent Spectral Regularization for Continual Learning","summary":" While biological intelligence grows organically as new knowledge is gathered\nthroughout life, Artificial Neural Networks forget catastrophically whenever\nthey face a changing training data distribution. Rehearsal-based Continual\nLearning (CL) approaches have been established as a versatile and reliable\nsolution to overcome this limitation; however, sudden input disruptions and\nmemory constraints are known to alter the consistency of their predictions. We\nstudy this phenomenon by investigating the geometric characteristics of the\nlearner's latent space and find that replayed data points of different classes\nincreasingly mix up, interfering with classification. Hence, we propose a\ngeometric regularizer that enforces weak requirements on the Laplacian spectrum\nof the latent space, promoting a partitioning behavior. We show that our\nproposal, called Continual Spectral Regularizer (CaSpeR), can be easily\ncombined with any rehearsal-based CL approach and improves the performance of\nSOTA methods on standard benchmarks. Finally, we conduct additional analysis to\nprovide insights into CaSpeR's effects and applicability.\n","authors":["Emanuele Frascaroli","Riccardo Benaglia","Matteo Boschini","Luca Moschella","Cosimo Fiorini","Emanuele Rodolà","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2301.03345v3.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.03476v1","updated":"2023-08-07T11:09:12Z","published":"2023-08-07T11:09:12Z","title":"Exploring the Physical World Adversarial Robustness of Vehicle Detection","summary":" Adversarial attacks can compromise the robustness of real-world detection\nmodels. However, evaluating these models under real-world conditions poses\nchallenges due to resource-intensive experiments. Virtual simulations offer an\nalternative, but the absence of standardized benchmarks hampers progress.\nAddressing this, we propose an innovative instant-level data generation\npipeline using the CARLA simulator. Through this pipeline, we establish the\nDiscrete and Continuous Instant-level (DCI) dataset, enabling comprehensive\nexperiments involving three detection models and three physical adversarial\nattacks. Our findings highlight diverse model performances under adversarial\nconditions. 
Yolo v6 demonstrates remarkable resilience, experiencing just a\nmarginal 6.59% average drop in average precision (AP). In contrast, the ASA\nattack yields a substantial 14.51% average AP reduction, twice the effect of\nother algorithms. We also note that static scenes yield higher recognition AP\nvalues, and outcomes remain relatively consistent across varying weather\nconditions. Intriguingly, our study suggests that advancements in adversarial\nattack algorithms may be approaching their ``limitation''. In summary, our work\nunderscores the significance of adversarial attacks in real-world contexts and\nintroduces the DCI dataset as a versatile benchmark. Our findings provide\nvaluable insights for enhancing the robustness of detection models and offer\nguidance for future research endeavors in the realm of adversarial attacks.\n","authors":["Wei Jiang","Tianyuan Zhang","Shuangcheng Liu","Weiyu Ji","Zichao Zhang","Gang Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03472v1","updated":"2023-08-07T11:02:44Z","published":"2023-08-07T11:02:44Z","title":"How to forecast power generation in wind farms? Insights from leveraging\n hierarchical structure","summary":" Forecasting of renewable energy generation provides key insights which may\nhelp with decision-making towards global decarbonisation. Renewable energy\ngeneration can often be represented through cross-sectional hierarchies,\nwhereby a single farm may have multiple individual generators. Hierarchical\nforecasting through reconciliation has demonstrated a significant increase in\nthe quality of forecasts both theoretically and empirically. However, it is not\nevident whether forecasts generated by individual temporal and cross-sectional\naggregation can be superior to integrated cross-temporal forecasts and to\nindividual forecasts on more granular data. In this study, we investigate the\naccuracies of different cross-sectional and cross-temporal reconciliation\nmethods using both linear regression and gradient boosting machine learning for\nforecasting wind farm power generation. We found that cross-temporal\nreconciliation is superior to individual cross-sectional reconciliation at\nmultiple temporal aggregations. Cross-temporally reconciled machine learning\nbase forecasts also demonstrated a high accuracy at coarser temporal\ngranularities, which may encourage adoption for short-term wind forecasts. We\nalso show that linear regression can outperform machine learning models across\nmost levels in cross-sectional wind time series.\n","authors":["Lucas English","Mahdi Abolghasemi"],"pdf_url":"https://arxiv.org/pdf/2308.03472v1.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2306.08432v2","updated":"2023-08-07T10:58:21Z","published":"2023-06-14T11:02:08Z","title":"Batches Stabilize the Minimum Norm Risk in High Dimensional\n Overparameterized Linear Regression","summary":" Learning algorithms that divide the data into batches are prevalent in many\nmachine-learning applications, typically offering useful trade-offs between\ncomputational efficiency and performance. In this paper, we examine the\nbenefits of batch-partitioning through the lens of a minimum-norm\noverparameterized linear regression model with isotropic Gaussian features. 
We\nsuggest a natural small-batch version of the minimum-norm estimator, and derive\nan upper bound on its quadratic risk, showing it is inversely proportional to\nthe noise level as well as to the overparameterization ratio, for the optimal\nchoice of batch size. In contrast to minimum-norm, our estimator admits a\nstable risk behavior that is monotonically increasing in the\noverparameterization ratio, eliminating both the blowup at the interpolation\npoint and the double-descent phenomenon. Interestingly, we observe that this\nimplicit regularization offered by the batch partition is partially explained\nby feature overlap between the batches. Our bound is derived via a novel\ncombination of techniques, in particular normal approximation in the\nWasserstein metric of noisy projections over random subspaces.\n","authors":["Shahar Stein Ioushua","Inbar Hasidim","Ofer Shayevitz","Meir Feder"],"pdf_url":"https://arxiv.org/pdf/2306.08432v2.pdf","comment":"55 pages"},{"id":"http://arxiv.org/abs/2308.03464v1","updated":"2023-08-07T10:43:48Z","published":"2023-08-07T10:43:48Z","title":"Wide Gaps and Clustering Axioms","summary":" The widely applied k-means algorithm produces clusterings that violate our\nexpectations with respect to high/low similarity/density and is in conflict\nwith Kleinberg's axiomatic system for distance based clustering algorithms that\nformalizes those expectations in a natural way. k-means violates in particular\nthe consistency axiom. We hypothesise that this clash is due to the unstated\nexpectation that the data themselves should have the property of\nbeing clusterable in order to expect the algorithm clustering them to fit a\nclustering axiomatic system. To demonstrate this, we introduce two new\nclusterability properties, variational k-separability and residual\nk-separability, and show that Kleinberg's consistency axiom then holds for\nk-means operating in the Euclidean or non-Euclidean space. Furthermore, we\npropose extensions of the k-means algorithm that approximately fit Kleinberg's\nrichness axiom, which does not hold for k-means. In this way, we reconcile\nk-means with Kleinberg's axiomatic framework in Euclidean and non-Euclidean\nsettings. Besides contributing to the theory of axiomatic frameworks of\nclustering and to clusterability theory, a practical contribution is the\npossibility to construct datasets for testing algorithms that\noptimize the k-means cost function. This includes a method of constructing\nclusterable data with a global optimum known in advance.\n","authors":["Mieczysław A. Kłopotek"],"pdf_url":"https://arxiv.org/pdf/2308.03464v1.pdf","comment":"14 Theorems. arXiv admin note: substantial text overlap with\n arXiv:2211.17036"},{"id":"http://arxiv.org/abs/2308.03457v1","updated":"2023-08-07T10:25:54Z","published":"2023-08-07T10:25:54Z","title":"Cross-Silo Prototypical Calibration for Federated Learning with Non-IID\n Data","summary":" Federated Learning aims to learn a global model on the server side that\ngeneralizes to all clients in a privacy-preserving manner, by leveraging the\nlocal models from different clients. Existing solutions focus on either\nregularizing the objective functions among clients or improving the aggregation\nmechanism for the improved model generalization capability. However, their\nperformance is typically limited by the dataset biases, such as the\nheterogeneous data distributions and the missing classes. 
To address this\nissue, this paper presents a cross-silo prototypical calibration method\n(FedCSPC), which takes additional prototype information from the clients to\nlearn a unified feature space on the server side. Specifically, FedCSPC first\nemploys the Data Prototypical Modeling (DPM) module to learn data patterns via\nclustering to aid calibration. Subsequently, the cross-silo prototypical\ncalibration (CSPC) module develops an augmented contrastive learning method to\nimprove the robustness of the calibration, which can effectively project\ncross-source features into a consistent space while maintaining clear decision\nboundaries. Moreover, the CSPC module's ease of implementation and\nplug-and-play characteristics make it even more remarkable. Experiments were\nconducted on four datasets in terms of performance comparison, ablation study,\nin-depth analysis and case study, and the results verified that FedCSPC is\ncapable of learning the consistent features across different data sources of\nthe same class under the guidance of calibrated model, which leads to better\nperformance than the state-of-the-art methods. The source codes have been\nreleased at https://github.com/qizhuang-qz/FedCSPC.\n","authors":["Zhuang Qi","Lei Meng","Zitan Chen","Han Hu","Hui Lin","Xiangxu Meng"],"pdf_url":"https://arxiv.org/pdf/2308.03457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07176v2","updated":"2023-08-07T10:09:21Z","published":"2023-05-11T23:12:13Z","title":"Automatic Radiology Report Generation by Learning with Increasingly Hard\n Negatives","summary":" Automatic radiology report generation is challenging as medical images or\nreports are usually similar to each other due to the common content of anatomy.\nThis makes a model hard to capture the uniqueness of individual images and is\nprone to producing undesired generic or mismatched reports. This situation\ncalls for learning more discriminative features that could capture even\nfine-grained mismatches between images and reports. To achieve this, this paper\nproposes a novel framework to learn discriminative image and report features by\ndistinguishing them from their closest peers, i.e., hard negatives. Especially,\nto attain more discriminative features, we gradually raise the difficulty of\nsuch a learning task by creating increasingly hard negative reports for each\nimage in the feature space during training, respectively. By treating the\nincreasingly hard negatives as auxiliary variables, we formulate this process\nas a min-max alternating optimisation problem. At each iteration, conditioned\non a given set of hard negative reports, image and report features are learned\nas usual by minimising the loss functions related to report generation. After\nthat, a new set of harder negative reports will be created by maximising a loss\nreflecting image-report alignment. By solving this optimisation, we attain a\nmodel that can generate more specific and accurate reports. It is noteworthy\nthat our framework enhances discriminative feature learning without introducing\nextra network weights. Also, in contrast to the existing way of generating hard\nnegatives, our framework extends beyond the granularity of the dataset by\ngenerating harder samples out of the training set. 
An experimental study on\nbenchmark datasets verifies the efficacy of our framework and shows that it can\nserve as a plug-in to readily improve existing medical report generation\nmodels.\n","authors":["Bhanu Prakash Voutharoja","Lei Wang","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.07176v2.pdf","comment":"Accepted to European Conference on Artificial Intelligence (ECAI)\n 2023"},{"id":"http://arxiv.org/abs/2306.07886v3","updated":"2023-08-07T10:01:49Z","published":"2023-06-13T16:25:30Z","title":"Symmetry & Critical Points for Symmetric Tensor Decomposition Problems","summary":" We consider the nonconvex optimization problem associated with the\ndecomposition of a real symmetric tensor into a sum of rank one terms. Use is\nmade of the rich symmetry structure to construct infinite families of critical\npoints represented by Puiseux series in the problem dimension, and so obtain\nprecise analytic estimates on the value of the objective function and the\nHessian spectrum. The results allow an analytic characterization of various\nobstructions to using local optimization methods, revealing in particular a\ncomplex array of saddles and minima differing by their symmetry, structure and\nanalytic properties. A desirable phenomenon, occurring for all critical points\nconsidered, concerns the number of negative Hessian eigenvalues increasing with\nthe value of the objective function. Our approach makes use of Newton polyhedra\nas well as results from real algebraic geometry, notably the Curve Selection\nLemma, to determine the extremal character of degenerate critical points,\nestablishing in particular the existence of infinite families of third-order\nsaddles which can significantly slow down the optimization process.\n","authors":["Yossi Arjevani","Gal Vinograd"],"pdf_url":"https://arxiv.org/pdf/2306.07886v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03443v1","updated":"2023-08-07T10:00:07Z","published":"2023-08-07T10:00:07Z","title":"Doubly Robust Estimator for Off-Policy Evaluation with Large Action\n Spaces","summary":" We study Off-Policy Evaluation (OPE) in contextual bandit settings with large\naction spaces. The benchmark estimators suffer from severe bias and variance\ntradeoffs. Parametric approaches suffer from bias due to difficulty specifying\nthe correct model, whereas importance-weighting estimators suffer from variance. To\novercome these limitations, Marginalized Inverse Propensity Scoring (MIPS) was\nproposed to mitigate the estimator's variance via embeddings of an action. To\nmake the estimator more accurate, we propose the doubly robust estimator of\nMIPS called the Marginalized Doubly Robust (MDR) estimator. Theoretical\nanalysis shows that the proposed estimator is unbiased under weaker assumptions\nthan MIPS while maintaining variance reduction against IPS, which was the main\nadvantage of MIPS. Empirical experiments verify the superiority of MDR\nover existing estimators.\n","authors":["Tatsuhiro Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.03443v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2301.09930v2","updated":"2023-08-07T09:48:44Z","published":"2023-01-24T11:27:17Z","title":"Quadruple-star systems are not always nested triples: a machine learning\n approach to dynamical stability","summary":" The dynamical stability of quadruple-star systems has traditionally been\ntreated as a problem involving two `nested' triples which constitute a\nquadruple. 
In this novel study, we employed a machine learning algorithm, the\nmulti-layer perceptron (MLP), to directly classify 2+2 and 3+1 quadruples based\non their stability (or long-term boundedness). The training data sets for the\nclassification, comprised of $5\\times10^5$ quadruples each, were integrated\nusing the highly accurate direct $N$-body code MSTAR. We also carried out a\nlimited parameter space study of zero-inclination systems to directly compare\nquadruples to triples. We found that both our quadruple MLP models perform\nbetter than a `nested' triple MLP approach, which is especially significant for\n3+1 quadruples. The classification accuracies for the 2+2 MLP and 3+1 MLP\nmodels are 94% and 93% respectively, while the scores for the `nested' triple\napproach are 88% and 66% respectively. This is a crucial implication for\nquadruple population synthesis studies. Our MLP models, which are very simple\nand almost instantaneous to implement, are available on GitHub, along with\nPython3 scripts to access them.\n","authors":["Pavan Vynatheya","Rosemary A. Mardling","Adrian S. Hamers"],"pdf_url":"https://arxiv.org/pdf/2301.09930v2.pdf","comment":"Accepted for publication by MNRAS"},{"id":"http://arxiv.org/abs/2306.09780v2","updated":"2023-08-07T09:25:55Z","published":"2023-06-16T11:33:47Z","title":"Understanding Deep Generative Models with Generalized Empirical\n Likelihoods","summary":" Understanding how well a deep generative model captures a distribution of\nhigh-dimensional data remains an important open challenge. It is especially\ndifficult for certain model classes, such as Generative Adversarial Networks\nand Diffusion Models, whose models do not admit exact likelihoods. In this\nwork, we demonstrate that generalized empirical likelihood (GEL) methods offer\na family of diagnostic tools that can identify many deficiencies of deep\ngenerative models (DGMs). We show, with appropriate specification of moment\nconditions, that the proposed method can identify which modes have been\ndropped, the degree to which DGMs are mode imbalanced, and whether DGMs\nsufficiently capture intra-class diversity. We show how to combine techniques\nfrom Maximum Mean Discrepancy and Generalized Empirical Likelihood to create\nnot only distribution tests that retain per-sample interpretability, but also\nmetrics that include label information. We find that such tests predict the\ndegree of mode dropping and mode imbalance up to 60% better than metrics such\nas improved precision/recall. We provide an implementation at\nhttps://github.com/deepmind/understanding_deep_generative_models_with_generalized_empirical_likelihood/.\n","authors":["Suman Ravuri","Mélanie Rey","Shakir Mohamed","Marc Deisenroth"],"pdf_url":"https://arxiv.org/pdf/2306.09780v2.pdf","comment":"Computer Vision and Pattern Recognition 2023 (Highlight, top 2.6% of\n submissions)"},{"id":"http://arxiv.org/abs/2210.14245v2","updated":"2023-08-07T09:09:48Z","published":"2022-10-25T18:00:25Z","title":"CaloFlow for CaloChallenge Dataset 1","summary":" CaloFlow is a new and promising approach to fast calorimeter simulation based\non normalizing flows. Applying CaloFlow to the photon and charged pion Geant4\nshowers of Dataset 1 of the Fast Calorimeter Simulation Challenge 2022, we show\nhow it can produce high-fidelity samples with a sampling time that is several\norders of magnitude faster than Geant4. 
We demonstrate the fidelity of the\nsamples using calorimeter shower images, histograms of high-level features, and\naggregate metrics such as a classifier trained to distinguish CaloFlow from\nGeant4 samples.\n","authors":["Claudius Krause","Ian Pang","David Shih"],"pdf_url":"https://arxiv.org/pdf/2210.14245v2.pdf","comment":"32 pages, 18 figures, v2: updated pion evaluation"},{"id":"http://arxiv.org/abs/2308.03417v1","updated":"2023-08-07T09:08:39Z","published":"2023-08-07T09:08:39Z","title":"PURL: Safe and Effective Sanitization of Link Decoration","summary":" While privacy-focused browsers have taken steps to block third-party cookies\nand browser fingerprinting, novel tracking methods that bypass existing\ndefenses continue to emerge. Since trackers need to exfiltrate information from\nthe client- to server-side through link decoration regardless of the tracking\ntechnique they employ, a promising orthogonal approach is to detect and\nsanitize tracking information in decorated links. We present PURL, a\nmachine-learning approach that leverages a cross-layer graph representation of\nwebpage execution to safely and effectively sanitize link decoration. Our\nevaluation shows that PURL significantly outperforms existing countermeasures\nin terms of accuracy and reducing website breakage while being robust to common\nevasion techniques. We use PURL to perform a measurement study on top-million\nwebsites. We find that link decorations are widely abused by well-known\nadvertisers and trackers to exfiltrate user information collected from browser\nstorage, email addresses, and scripts involved in fingerprinting.\n","authors":["Shaoor Munir","Patrick Lee","Umar Iqbal","Zubair Shafiq","Sandra Siby"],"pdf_url":"https://arxiv.org/pdf/2308.03417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12414v3","updated":"2023-08-07T08:54:11Z","published":"2023-03-22T09:23:29Z","title":"Delay-Aware Hierarchical Federated Learning","summary":" Federated learning has gained popularity as a means of training models\ndistributed across the wireless edge. The paper introduces delay-aware\nhierarchical federated learning (DFL) to improve the efficiency of distributed\nmachine learning (ML) model training by accounting for communication delays\nbetween edge and cloud. Different from traditional federated learning, DFL\nleverages multiple stochastic gradient descent iterations on device datasets\nwithin each global aggregation period and intermittently aggregates model\nparameters through edge servers in local subnetworks. During global\nsynchronization, the cloud server consolidates local models with the outdated\nglobal model using a local-global combiner, thus preserving crucial elements of\nboth, enhancing learning efficiency under the presence of delay. A set of\nconditions is obtained to achieve the sub-linear convergence rate of O(1/k).\nBased on these findings, an adaptive control algorithm is developed for DFL,\nimplementing policies to mitigate energy consumption and communication latency\nwhile aiming for a sublinear convergence rate. Numerical evaluations show DFL's\nsuperior performance in terms of faster global model convergence, reduced\nresource consumption, and robustness against communication delays compared to\nexisting FL algorithms. 
In summary, this proposed method offers improved\nefficiency and results when dealing with both convex and non-convex loss\nfunctions.\n","authors":["Frank Po-Chen Lin","Seyyedali Hosseinalipour","Nicolò Michelusi","Christopher Brinton"],"pdf_url":"https://arxiv.org/pdf/2303.12414v3.pdf","comment":"A condensed version of this paper was presented at IEEE Globecom 2020"},{"id":"http://arxiv.org/abs/2308.03404v1","updated":"2023-08-07T08:46:10Z","published":"2023-08-07T08:46:10Z","title":"Applied metamodelling for ATM performance simulations","summary":" The use of Air traffic management (ATM) simulators for planning and operations\ncan be challenging due to their modelling complexity. This paper presents XALM\n(eXplainable Active Learning Metamodel), a three-step framework integrating\nactive learning and SHAP (SHapley Additive exPlanations) values into simulation\nmetamodels for supporting ATM decision-making. XALM efficiently uncovers hidden\nrelationships among input and output variables in ATM simulators, which are usually\nof interest in policy analysis. Our experiments show XALM's predictive\nperformance comparable to the XGBoost metamodel with fewer simulations.\nAdditionally, XALM exhibits superior explanatory capabilities compared to\nnon-active learning metamodels.\n Using the `Mercury' (flight and passenger) ATM simulator, XALM is applied to\na real-world scenario in Paris Charles de Gaulle airport, extending an arrival\nmanager's range and scope by analysing six variables. This case study\nillustrates XALM's effectiveness in enhancing simulation interpretability and\nunderstanding variable interactions. By addressing computational challenges and\nimproving explainability, XALM complements traditional simulation-based\nanalyses.\n Lastly, we discuss two practical approaches for reducing the computational\nburden of the metamodelling further: we introduce a stopping criterion for\nactive learning based on the inherent uncertainty of the metamodel, and we show\nhow the simulations used for the metamodel can be reused across key performance\nindicators, thus decreasing the overall number of simulations needed.\n","authors":["Christoffer Riis","Francisco N. Antunes","Tatjana Bolić","Gérald Gurtner","Andrew Cook","Carlos Lima Azevedo","Francisco Câmara Pereira"],"pdf_url":"https://arxiv.org/pdf/2308.03404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03403v1","updated":"2023-08-07T08:44:15Z","published":"2023-08-07T08:44:15Z","title":"Towards Machine Learning-based Fish Stock Assessment","summary":" The accurate assessment of fish stocks is crucial for sustainable fisheries\nmanagement. However, existing statistical stock assessment models can have low\nforecast performance of relevant stock parameters like recruitment or spawning\nstock biomass, especially in ecosystems that are changing due to global warming\nand other anthropogenic stressors. In this paper, we investigate the use of\nmachine learning models to improve the estimation and forecast of such stock\nparameters. We propose a hybrid model that combines classical statistical stock\nassessment models with supervised ML, specifically gradient boosted trees. Our\nhybrid model leverages the initial estimate provided by the classical model and\nuses the ML model to make a post-hoc correction to improve accuracy. We\nexperiment with five different stocks and find that the forecast accuracy of\nrecruitment and spawning stock biomass improves considerably in most cases.\n","authors":["Stefan Lüdtke","Maria E. 
Pierce"],"pdf_url":"https://arxiv.org/pdf/2308.03403v1.pdf","comment":"Accepted at Fragile Earth Workshop 2023"},{"id":"http://arxiv.org/abs/2307.12306v2","updated":"2023-08-07T08:36:45Z","published":"2023-07-23T12:18:12Z","title":"Tackling the Curse of Dimensionality with Physics-Informed Neural\n Networks","summary":" The curse-of-dimensionality (CoD) taxes computational resources heavily with\nexponentially increasing computational cost as the dimension increases. This\nposes great challenges in solving high-dimensional PDEs as Richard Bellman\nfirst pointed out over 60 years ago. While there has been some recent success\nin solving numerically partial differential equations (PDEs) in high\ndimensions, such computations are prohibitively expensive, and true scaling of\ngeneral nonlinear PDEs to high dimensions has never been achieved. In this\npaper, we develop a new method of scaling up physics-informed neural networks\n(PINNs) to solve arbitrary high-dimensional PDEs. The new method, called\nStochastic Dimension Gradient Descent (SDGD), decomposes a gradient of PDEs\ninto pieces corresponding to different dimensions and samples randomly a subset\nof these dimensional pieces in each iteration of training PINNs. We\ntheoretically prove the convergence guarantee and other desired properties of\nthe proposed method. We experimentally demonstrate that the proposed method\nallows us to solve many notoriously hard high-dimensional PDEs, including the\nHamilton-Jacobi-Bellman (HJB) and the Schr\\\"{o}dinger equations in thousands of\ndimensions very fast on a single GPU using the PINNs mesh-free approach. For\ninstance, we solve nontrivial nonlinear PDEs (one HJB equation and one\nBlack-Scholes equation) in 100,000 dimensions in 6 hours on a single GPU using\nSDGD with PINNs. Since SDGD is a general training methodology of PINNs, SDGD\ncan be applied to any current and future variants of PINNs to scale them up for\narbitrary high-dimensional PDEs.\n","authors":["Zheyuan Hu","Khemraj Shukla","George Em Karniadakis","Kenji Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2307.12306v2.pdf","comment":"37 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.03382v1","updated":"2023-08-07T08:03:20Z","published":"2023-08-07T08:03:20Z","title":"Enhancing Nucleus Segmentation with HARU-Net: A Hybrid Attention Based\n Residual U-Blocks Network","summary":" Nucleus image segmentation is a crucial step in the analysis, pathological\ndiagnosis, and classification, which heavily relies on the quality of nucleus\nsegmentation. However, the complexity of issues such as variations in nucleus\nsize, blurred nucleus contours, uneven staining, cell clustering, and\noverlapping cells poses significant challenges. Current methods for nucleus\nsegmentation primarily rely on nuclear morphology or contour-based approaches.\nNuclear morphology-based methods exhibit limited generalization ability and\nstruggle to effectively predict irregular-shaped nuclei, while contour-based\nextraction methods face challenges in accurately segmenting overlapping nuclei.\nTo address the aforementioned issues, we propose a dual-branch network using\nhybrid attention based residual U-blocks for nucleus instance segmentation. The\nnetwork simultaneously predicts target information and target contours.\nAdditionally, we introduce a post-processing method that combines the target\ninformation and target contours to distinguish overlapping nuclei and generate\nan instance segmentation image. 
Within the network, we propose a context fusion\nblock (CF-block) that effectively extracts and merges contextual information\nfrom the network. Extensive quantitative evaluations are conducted to assess\nthe performance of our method. Experimental results demonstrate the superior\nperformance of the proposed method compared to state-of-the-art approaches on\nthe BNS, MoNuSeg, CoNSeg, and CPM-17 datasets.\n","authors":["Junzhou Chen","Qian Huang","Yulin Chen","Linyi Qian","Chengyuan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03382v1.pdf","comment":"Nucleus segmentation, Deep learning, Instance segmentation, Medical\n imaging, Dual-Branch network"},{"id":"http://arxiv.org/abs/2304.14104v2","updated":"2023-08-07T07:52:35Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v2.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2302.02807v2","updated":"2023-08-07T07:43:37Z","published":"2023-02-06T14:31:51Z","title":"Federated Survival Forests","summary":" Survival analysis is a subfield of statistics concerned with modeling the\noccurrence time of a particular event of interest for a population. Survival\nanalysis found widespread applications in healthcare, engineering, and social\nsciences. However, real-world applications involve survival datasets that are\ndistributed, incomplete, censored, and confidential. In this context, federated\nlearning can tremendously improve the performance of survival analysis\napplications. Federated learning provides a set of privacy-preserving\ntechniques to jointly train machine learning models on multiple datasets\nwithout compromising user privacy, leading to a better generalization\nperformance. However, despite the widespread development of federated learning\nin recent AI research, few studies focus on federated survival analysis. In\nthis work, we present a novel federated algorithm for survival analysis based\non one of the most successful survival models, the random survival forest. We\ncall the proposed method Federated Survival Forest (FedSurF). 
With a single\ncommunication round, FedSurF obtains a discriminative power comparable to\ndeep-learning-based federated models trained over hundreds of federated\niterations. Moreover, FedSurF retains all the advantages of random forests,\nnamely low computational cost and natural handling of missing values and\nincomplete datasets. These advantages are especially desirable in real-world\nfederated environments with multiple small datasets stored on devices with low\ncomputational capabilities. Numerical experiments compare FedSurF with\nstate-of-the-art survival models in federated networks, showing how FedSurF\noutperforms deep-learning-based federated algorithms in realistic environments\nwith non-identically distributed data.\n","authors":["Alberto Archetti","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2302.02807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05628v2","updated":"2023-08-07T07:41:47Z","published":"2023-07-11T06:30:43Z","title":"DNAGPT: A Generalized Pre-trained Tool for Versatile DNA Sequence\n Analysis Tasks","summary":" GPT has been proven to be capable of extracting general information from\nlanguage sequences, thereby benefiting all downstream tasks. This motivates us\nto use pre-trained models to explore the hidden inherent information in DNA\nsequences. However, data and task requirements in DNA sequence analyses come in\ndifferent formats such as generation, prediction and regression, vary in\ncomplexity, and involve different modalities, such as nucleotide sequences\nand expression levels. Existing BERT-based models are mostly for\ngeneration tasks and use sequence data as input and output, thus cannot easily\nhandle various DNA analysis tasks in one single model. Herein, we propose a\ngeneralized pre-trained DNA model, DNAGPT, that was trained on over 200\nbillion base pairs from all mammals. We enhance the classic GPT model by\nadding a binary classification task (DNA sequence order) and a numerical regression\ntask (guanine-cytosine content prediction) in the pre-training period, and by\nextending the architecture with corresponding embedding layers and encoding\nheads. We also design a comprehensive token language to encode sequence, number\nand task-related information in the same token space. Therefore, DNAGPT can\nhandle versatile DNA analysis tasks and simultaneously process both\nsequence and numerical data. We have evaluated our model on genomic signal and\nregion recognition, pseudo-genome generation and mRNA abundance regression\ntasks. We demonstrate that, benefiting from pre-training, DNAGPT shows\nsuperior performance compared to existing models specially designed for various\ndownstream tasks.\n","authors":["Daoan Zhang","Weitong Zhang","Bing He","Yu Zhao","Jianguo Zhang","Chenchen Qin","Jianhua Yao"],"pdf_url":"https://arxiv.org/pdf/2307.05628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03363v1","updated":"2023-08-07T07:37:26Z","published":"2023-08-07T07:37:26Z","title":"A reading survey on adversarial machine learning: Adversarial attacks\n and their understanding","summary":" Deep Learning has empowered us to train neural networks for complex data with\nhigh performance. However, with the growing research, several vulnerabilities\nin neural networks have been exposed. A particular branch of research,\nAdversarial Machine Learning, exploits and understands some of the\nvulnerabilities that cause the neural networks to misclassify for near original\ninput. 
A class of algorithms called adversarial attacks is proposed to make the\nneural networks misclassify for various tasks in different domains. With the\nextensive and growing research in adversarial attacks, it is crucial to\nunderstand the classification of adversarial attacks. This will help us\nunderstand the vulnerabilities in a systematic order and help us to mitigate\nthe effects of adversarial attacks. This article provides a survey of existing\nadversarial attacks and their understanding based on different perspectives. We\nalso provide a brief overview of existing adversarial defences and their\nlimitations in mitigating the effect of adversarial attacks. Further, we\nconclude with a discussion on the future research directions in the field of\nadversarial machine learning.\n","authors":["Shashank Kotyan"],"pdf_url":"https://arxiv.org/pdf/2308.03363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.11312v3","updated":"2023-08-07T07:20:51Z","published":"2022-07-22T19:38:25Z","title":"HybMT: Hybrid Meta-Predictor based ML Algorithm for Fast Test Vector\n Generation","summary":" ML models are increasingly being used to increase the test coverage and\ndecrease the overall testing time. This field is still in its nascent stage and\nup till now there were no algorithms that could match or outperform commercial\ntools in terms of speed and accuracy for large circuits. We propose an ATPG\nalgorithm HybMT in this paper that finally breaks this barrier. Like sister\nmethods, we augment the classical PODEM algorithm that uses recursive\nbacktracking. We design a custom 2-level predictor that predicts the input net\nof a logic gate whose value needs to be set to ensure that the output is a\ngiven value (0 or 1). Our predictor chooses the output from among two\nfirst-level predictors, where the most effective one is a bespoke neural\nnetwork and the other is an SVM regressor. As compared to a popular,\nstate-of-the-art commercial ATPG tool, HybMT shows an overall reduction of\n56.6% in the CPU time without compromising on the fault coverage for the EPFL\nbenchmark circuits. HybMT also shows a speedup of 126.4% over the best ML-based\nalgorithm while obtaining an equal or better fault coverage for the EPFL\nbenchmark circuits.\n","authors":["Shruti Pandey"," Jayadeva","Smruti R. Sarangi"],"pdf_url":"https://arxiv.org/pdf/2207.11312v3.pdf","comment":"6 pages, 5 figures and 5 tables. Changes from the previous version:\n We modified our novel neural network model \"HybNN\" with a skip connection and\n found a significant improvement in the fault coverage and runtime of our\n HybMT-based PODEM algorithm. We train on the smaller ISCAS'85 circuits,\n report the results for the EPFL benchmark circuits (most recent and up to 70X\n large)"},{"id":"http://arxiv.org/abs/2303.01254v3","updated":"2023-08-07T07:07:25Z","published":"2023-02-13T10:33:21Z","title":"Privacy-Preserving Tree-Based Inference with TFHE","summary":" Privacy enhancing technologies (PETs) have been proposed as a way to protect\nthe privacy of data while still allowing for data analysis. In this work, we\nfocus on Fully Homomorphic Encryption (FHE), a powerful tool that allows for\narbitrary computations to be performed on encrypted data. FHE has received lots\nof attention in the past few years and has reached realistic execution times\nand correctness.\n More precisely, we explain in this paper how we apply FHE to tree-based\nmodels and get state-of-the-art solutions over encrypted tabular data. 
We show\nthat our method is applicable to a wide range of tree-based models, including\ndecision trees, random forests, and gradient boosted trees, and has been\nimplemented within the Concrete-ML library, which is open-source at\nhttps://github.com/zama-ai/concrete-ml. With a selected set of use-cases, we\ndemonstrate that our FHE version is very close to the unprotected version in\nterms of accuracy.\n","authors":["Jordan Frery","Andrei Stoian","Roman Bredehoft","Luis Montero","Celia Kherfallah","Benoit Chevallier-Mames","Arthur Meyre"],"pdf_url":"https://arxiv.org/pdf/2303.01254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.10510v2","updated":"2023-08-07T06:40:13Z","published":"2022-09-21T17:15:58Z","title":"Learning to Relight Portrait Images via a Virtual Light Stage and\n Synthetic-to-Real Adaptation","summary":" Given a portrait image of a person and an environment map of the target\nlighting, portrait relighting aims to re-illuminate the person in the image as\nif the person appeared in an environment with the target lighting. To achieve\nhigh-quality results, recent methods rely on deep learning. An effective\napproach is to supervise the training of deep neural networks with a\nhigh-fidelity dataset of desired input-output pairs, captured with a light\nstage. However, acquiring such data requires an expensive special capture rig\nand time-consuming efforts, limiting access to only a few resourceful\nlaboratories. To address the limitation, we propose a new approach that can\nperform on par with the state-of-the-art (SOTA) relighting methods without\nrequiring a light stage. Our approach is based on the realization that a\nsuccessful relighting of a portrait image depends on two conditions. First, the\nmethod needs to mimic the behaviors of physically-based relighting. Second, the\noutput has to be photorealistic. To meet the first condition, we propose to\ntrain the relighting network with training data generated by a virtual light\nstage that performs physically-based rendering on various 3D synthetic humans\nunder different environment maps. To meet the second condition, we develop a\nnovel synthetic-to-real approach to bring photorealism to the relighting\nnetwork output. In addition to achieving SOTA results, our approach offers\nseveral advantages over the prior methods, including controllable glares on\nglasses and more temporally-consistent results for relighting videos.\n","authors":["Yu-Ying Yeh","Koki Nagano","Sameh Khamis","Jan Kautz","Ming-Yu Liu","Ting-Chun Wang"],"pdf_url":"https://arxiv.org/pdf/2209.10510v2.pdf","comment":"To appear in ACM Transactions on Graphics (SIGGRAPH Asia 2022). 21\n pages, 25 figures, 7 tables. Project page:\n https://research.nvidia.com/labs/dir/lumos/"},{"id":"http://arxiv.org/abs/2308.03337v1","updated":"2023-08-07T06:38:59Z","published":"2023-08-07T06:38:59Z","title":"Solving Falkner-Skan type equations via Legendre and Chebyshev Neural\n Blocks","summary":" In this paper, a new deep-learning architecture for solving the non-linear\nFalkner-Skan equation is proposed. Using Legendre and Chebyshev neural blocks,\nthis approach shows how orthogonal polynomials can be used in neural networks\nto increase the approximation capability of artificial neural networks. In\naddition, utilizing the mathematical properties of these functions, we overcome\nthe computational complexity of the backpropagation algorithm by using the\noperational matrices of the derivative. 
The efficiency of the proposed method\nis demonstrated by simulating various configurations of the Falkner-Skan\nequation.\n","authors":["Alireza Afzal Aghaei","Kourosh Parand","Ali Nikkhah","Shakila Jaberi"],"pdf_url":"https://arxiv.org/pdf/2308.03337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03960v2","updated":"2023-08-07T06:35:25Z","published":"2023-05-06T07:06:47Z","title":"Beyond Rule-based Named Entity Recognition and Relation Extraction for\n Process Model Generation from Natural Language Text","summary":" Process-aware information systems offer extensive advantages to companies,\nfacilitating planning, operations, and optimization of day-to-day business\nactivities. However, the time-consuming but required step of designing formal\nbusiness process models often hampers the potential of these systems. To\novercome this challenge, automated generation of business process models from\nnatural language text has emerged as a promising approach to expedite this\nstep. Generally, two crucial subtasks have to be solved: extracting\nprocess-relevant information from natural language and creating the actual\nmodel. Approaches towards the first subtask are rule-based methods, highly\noptimized for specific domains, but hard to adapt to related applications. To\nsolve this issue, we present an extension to an existing pipeline to make it\nentirely data-driven. We demonstrate the competitiveness of our improved\npipeline, which not only eliminates the substantial overhead associated with\nfeature engineering and rule definition, but also enables adaptation to\ndifferent datasets, entity and relation types, and new domains. Additionally,\nthe largest available dataset (PET) for the first subtask contains no\ninformation about linguistic references between mentions of entities in the\nprocess description. Yet, the resolution of these mentions into a single visual\nelement is essential for high-quality process models. We propose an extension\nto the PET dataset that incorporates information about linguistic references\nand a corresponding method for resolving them. Finally, we provide a detailed\nanalysis of the inherent challenges in the dataset at hand.\n","authors":["Julian Neuberger","Lars Ackermann","Stefan Jablonski"],"pdf_url":"https://arxiv.org/pdf/2305.03960v2.pdf","comment":"Currently under review for CoopIS23"},{"id":"http://arxiv.org/abs/2305.18462v2","updated":"2023-08-07T06:32:56Z","published":"2023-05-29T07:06:03Z","title":"Membership Inference Attacks against Language Models via Neighbourhood\n Comparison","summary":" Membership Inference attacks (MIAs) aim to predict whether a data sample was\npresent in the training data of a machine learning model or not, and are widely\nused for assessing the privacy risks of language models. Most existing attacks\nrely on the observation that models tend to assign higher probabilities to\ntheir training samples than non-training points. However, simple thresholding\nof the model score in isolation tends to lead to high false-positive rates as\nit does not account for the intrinsic complexity of a sample. Recent work has\ndemonstrated that reference-based attacks which compare model scores to those\nobtained from a reference model trained on similar data can substantially\nimprove the performance of MIAs. However, in order to train reference models,\nattacks of this kind make the strong and arguably unrealistic assumption that\nan adversary has access to samples closely resembling the original training\ndata. 
Therefore, we investigate their performance in more realistic scenarios\nand find that they are highly fragile in relation to the data distribution used\nto train reference models. To investigate whether this fragility provides a\nlayer of safety, we propose and evaluate neighbourhood attacks, which compare\nmodel scores for a given sample to scores of synthetically generated neighbour\ntexts and therefore eliminate the need for access to the training data\ndistribution. We show that, in addition to being competitive with\nreference-based attacks that have perfect knowledge about the training data\ndistribution, our attack clearly outperforms existing reference-free attacks as\nwell as reference-based attacks with imperfect knowledge, which demonstrates\nthe need for a reevaluation of the threat model of adversarial attacks.\n","authors":["Justus Mattern","Fatemehsadat Mireshghallah","Zhijing Jin","Bernhard Schölkopf","Mrinmaya Sachan","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2305.18462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03330v1","updated":"2023-08-07T06:23:24Z","published":"2023-08-07T06:23:24Z","title":"Expediting Neural Network Verification via Network Reduction","summary":" A wide range of verification methods have been proposed to verify the safety\nproperties of deep neural networks ensuring that the networks function\ncorrectly in critical applications. However, many well-known verification tools\nstill struggle with complicated network architectures and large network sizes.\nIn this work, we propose a network reduction technique as a pre-processing\nmethod prior to verification. The proposed method reduces neural networks via\neliminating stable ReLU neurons, and transforming them into a sequential neural\nnetwork consisting of ReLU and Affine layers which can be handled by the most\nverification tools. We instantiate the reduction technique on the\nstate-of-the-art complete and incomplete verification tools, including\nalpha-beta-crown, VeriNet and PRIMA. Our experiments on a large set of\nbenchmarks indicate that the proposed technique can significantly reduce neural\nnetworks and speed up existing verification tools. Furthermore, the experiment\nresults also show that network reduction can improve the availability of\nexisting verification tools on many networks by reducing them into sequential\nneural networks.\n","authors":["Yuyi Zhong","Ruiwei Wang","Siau-Cheng Khoo"],"pdf_url":"https://arxiv.org/pdf/2308.03330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07912v2","updated":"2023-08-07T06:20:31Z","published":"2023-01-19T06:46:36Z","title":"Interval Reachability of Nonlinear Dynamical Systems with Neural Network\n Controllers","summary":" This paper proposes a computationally efficient framework, based on interval\nanalysis, for rigorous verification of nonlinear continuous-time dynamical\nsystems with neural network controllers. Given a neural network, we use an\nexisting verification algorithm to construct inclusion functions for its\ninput-output behavior. Inspired by mixed monotone theory, we embed the\nclosed-loop dynamics into a larger system using an inclusion function of the\nneural network and a decomposition function of the open-loop system. 
This\nembedding provides a scalable approach for safety analysis of the neural\ncontrol loop while preserving the nonlinear structure of the system.\n We show that one can efficiently compute hyper-rectangular\nover-approximations of the reachable sets using a single trajectory of the\nembedding system. We design an algorithm to leverage this computational\nadvantage through partitioning strategies, improving our reachable set\nestimates while balancing its runtime with tunable parameters. We demonstrate\nthe performance of this algorithm through two case studies. First, we\ndemonstrate this method's strength in complex nonlinear environments. Then, we\nshow that our approach matches the performance of the state-of-the-art\nverification algorithm for linear discretized systems.\n","authors":["Saber Jafarpour","Akash Harapanahalli","Samuel Coogan"],"pdf_url":"https://arxiv.org/pdf/2301.07912v2.pdf","comment":"Extended L4DC version with proofs"},{"id":"http://arxiv.org/abs/2308.03321v1","updated":"2023-08-07T06:08:51Z","published":"2023-08-07T06:08:51Z","title":"AFN: Adaptive Fusion Normalization via Encoder-Decoder Framework","summary":" The success of deep learning is inseparable from normalization layers.\nResearchers have proposed various normalization functions, and each of them has\nboth advantages and disadvantages. In response, efforts have been made to\ndesign a unified normalization function that combines all normalization\nprocedures and mitigates their weaknesses. We propose a new normalization\nfunction called Adaptive Fusion Normalization (AFN). Through experiments, we\ndemonstrate that AFN outperforms previous normalization techniques on domain\ngeneralization and image classification tasks.\n","authors":["Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03321v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2106.01899 by other authors"},{"id":"http://arxiv.org/abs/2308.03320v1","updated":"2023-08-07T06:07:04Z","published":"2023-08-07T06:07:04Z","title":"Binary Federated Learning with Client-Level Differential Privacy","summary":" Federated learning (FL) is a privacy-preserving collaborative learning\nframework, and differential privacy can be applied to further enhance its\nprivacy protection. Existing FL systems typically adopt Federated Average\n(FedAvg) as the training algorithm and implement differential privacy with a\nGaussian mechanism. However, the inherent privacy-utility trade-off in these\nsystems severely degrades the training performance if a tight privacy budget is\nenforced. Besides, the Gaussian mechanism requires model weights to be of high\nprecision. To improve communication efficiency and achieve a better\nprivacy-utility trade-off, we propose a communication-efficient FL training\nalgorithm with a differential privacy guarantee. Specifically, we propose to\nadopt binary neural networks (BNNs) and introduce discrete noise in the FL\nsetting. Binary model parameters are uploaded for higher communication\nefficiency and discrete noise is added to achieve client-level differential\nprivacy protection. The achieved performance guarantee is rigorously proved,\nand it is shown to depend on the level of discrete noise. 
Experimental results\non the MNIST and Fashion-MNIST datasets demonstrate that the proposed\ntraining algorithm achieves client-level privacy protection with a performance\ngain while enjoying the benefits of low communication overhead from binary\nmodel updates.\n","authors":["Lumin Liu","Jun Zhang","Shenghui Song","Khaled B. Letaief"],"pdf_url":"https://arxiv.org/pdf/2308.03320v1.pdf","comment":"6 pages, 6 figures, accepted by IEEE GLOBECOM 2023"},{"id":"http://arxiv.org/abs/2308.03317v1","updated":"2023-08-07T06:01:50Z","published":"2023-08-07T06:01:50Z","title":"HomOpt: A Homotopy-Based Hyperparameter Optimization Method","summary":" Machine learning has achieved remarkable success over the past couple of\ndecades, often attributed to a combination of algorithmic innovations and the\navailability of high-quality data at scale. However, a third critical\ncomponent is the fine-tuning of hyperparameters, which plays a pivotal role in\nachieving optimal model performance. Despite its significance, hyperparameter\noptimization (HPO) remains a challenging task for several reasons. Many HPO\ntechniques rely on naive search methods or assume that the loss function is\nsmooth and continuous, which may not always be the case. Traditional methods,\nlike grid search and Bayesian optimization, often struggle to quickly adapt and\nefficiently search the loss landscape. Grid search is computationally\nexpensive, while Bayesian optimization can be slow to prime. Since the search\nspace for HPO is frequently high-dimensional and non-convex, it is often\nchallenging to efficiently find a global minimum. Moreover, optimal\nhyperparameters can be sensitive to the specific dataset or task, further\ncomplicating the search process. To address these issues, we propose a new\nhyperparameter optimization method, HomOpt, using a data-driven approach based\non a generalized additive model (GAM) surrogate combined with homotopy\noptimization. This strategy augments established optimization methodologies to\nboost the performance and effectiveness of any given method with faster\nconvergence to the optimum on continuous, discrete, and categorical domain\nspaces. We compare the effectiveness of HomOpt applied to multiple optimization\ntechniques (e.g., Random Search, TPE, Bayes, and SMAC), showing improved\nobjective performance on many standardized machine learning benchmarks and\nchallenging open-set recognition tasks.\n","authors":["Sophia J. Abraham","Kehelwala D. G. Maduranga","Jeffery Kinnison","Zachariah Carmichael","Jonathan D. Hauenstein","Walter J. Scheirer"],"pdf_url":"https://arxiv.org/pdf/2308.03317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03316v1","updated":"2023-08-07T05:58:40Z","published":"2023-08-07T05:58:40Z","title":"Deep Q-Network for Stochastic Process Environments","summary":" Reinforcement learning is a powerful approach for training an optimal policy\nto solve complex problems in a given system. This project aims to demonstrate\nthe application of reinforcement learning in stochastic process environments\nwith missing information, using Flappy Bird and a newly developed stock trading\nenvironment as case studies. We evaluate various structures of Deep Q-learning\nnetworks and identify the most suitable variant for the stochastic process\nenvironment. 
Additionally, we discuss the current challenges and propose\npotential improvements for further work in environment-building and\nreinforcement learning techniques.\n","authors":["Kuangheng He"],"pdf_url":"https://arxiv.org/pdf/2308.03316v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.03724v2","updated":"2023-08-07T05:52:36Z","published":"2023-03-07T08:16:46Z","title":"Learning Bipedal Walking for Humanoids with Current Feedback","summary":" Recent advances in deep reinforcement learning (RL) based techniques combined\nwith training in simulation have offered a new approach to developing robust\ncontrollers for legged robots. However, the application of such approaches to\nreal hardware has largely been limited to quadrupedal robots with direct-drive\nactuators and light-weight bipedal robots with low gear-ratio transmission\nsystems. Application to real, life-sized humanoid robots has been less common\narguably due to a large sim2real gap. In this paper, we present an approach for\neffectively overcoming the sim2real gap issue for humanoid robots arising from\ninaccurate torque-tracking at the actuator level. Our key idea is to utilize\nthe current feedback from the actuators on the real robot, after training the\npolicy in a simulation environment artificially degraded with poor\ntorque-tracking. Our approach successfully trains a unified, end-to-end policy\nin simulation that can be deployed on a real HRP-5P humanoid robot to achieve\nbipedal locomotion. Through ablations, we also show that a feedforward policy\narchitecture combined with targeted dynamics randomization is sufficient for\nzero-shot sim2real success, thus eliminating the need for computationally\nexpensive, memory-based network architectures. Finally, we validate the\nrobustness of the proposed RL policy by comparing its performance against a\nconventional model-based controller for walking on uneven terrain with the real\nrobot.\n","authors":["Rohan Pratap Singh","Zhaoming Xie","Pierre Gergondet","Fumio Kanehiro"],"pdf_url":"https://arxiv.org/pdf/2303.03724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03312v1","updated":"2023-08-07T05:40:58Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. 
Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03306v1","updated":"2023-08-07T05:22:33Z","published":"2023-08-07T05:22:33Z","title":"Implicit Graph Neural Diffusion Based on Constrained Dirichlet Energy\n Minimization","summary":" Implicit graph neural networks (GNNs) have emerged as a potential approach to\nenable GNNs to capture long-range dependencies effectively. However, poorly\ndesigned implicit GNN layers can experience over-smoothing or may have limited\nadaptability to learn data geometry, potentially hindering their performance in\ngraph learning problems. To address these issues, we introduce a geometric\nframework to design implicit graph diffusion layers based on a parameterized\ngraph Laplacian operator. Our framework allows learning the geometry of vertex\nand edge spaces, as well as the graph gradient operator from data. We further\nshow how implicit GNN layers can be viewed as the fixed-point solution of a\nDirichlet energy minimization problem and give conditions under which it may\nsuffer from over-smoothing. To overcome the over-smoothing problem, we design\nour implicit graph diffusion layer as the solution of a Dirichlet energy\nminimization problem with constraints on vertex features, enabling it to trade\noff smoothing with the preservation of node feature information. With an\nappropriate hyperparameter set to be larger than the largest eigenvalue of the\nparameterized graph Laplacian, our framework guarantees a unique equilibrium\nand quick convergence. Our models demonstrate better performance than leading\nimplicit and explicit GNNs on benchmark datasets for node and graph\nclassification tasks, with substantial accuracy improvements observed for some\ndatasets.\n","authors":["Guoji Fu","Mohammed Haroon Dupty","Yanfei Dong","Lee Wee Sun"],"pdf_url":"https://arxiv.org/pdf/2308.03306v1.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2308.03300v1","updated":"2023-08-07T05:05:49Z","published":"2023-08-07T05:05:49Z","title":"Do You Remember? Overcoming Catastrophic Forgetting for Fake Audio\n Detection","summary":" Current fake audio detection algorithms have achieved promising performances\non most datasets. However, their performance may be significantly degraded when\ndealing with audio of a different dataset. The orthogonal weight modification\nto overcome catastrophic forgetting does not consider the similarity of genuine\naudio across different datasets. To overcome this limitation, we propose a\ncontinual learning algorithm for fake audio detection to overcome catastrophic\nforgetting, called Regularized Adaptive Weight Modification (RAWM). When\nfine-tuning a detection network, our approach adaptively computes the direction\nof weight modification according to the ratio of genuine utterances and fake\nutterances. The adaptive modification direction ensures the network can\neffectively detect fake audio on the new dataset while preserving its knowledge\nof old model, thus mitigating catastrophic forgetting. 
In addition, genuine\naudio collected from quite different acoustic conditions may skew its feature\ndistribution, so we introduce a regularization constraint to force the network\nto remember the old distribution in this regard. Our method can easily be\ngeneralized to related fields, like speech emotion recognition. We also\nevaluate our approach across multiple datasets and obtain a significant\nperformance improvement on cross-dataset experiments.\n","authors":["Xiaohui Zhang","Jiangyan Yi","Jianhua Tao","Chenglong Wang","Chuyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03300v1.pdf","comment":"40th International Conference on Machine Learning (ICML 2023)"},{"id":"http://arxiv.org/abs/2308.03296v1","updated":"2023-08-07T04:47:42Z","published":"2023-08-07T04:47:42Z","title":"Studying Large Language Model Generalization with Influence Functions","summary":" When trying to gain better visibility into a machine learning model in order\nto understand and mitigate the associated risks, a potentially valuable source\nof evidence is: which training examples most contribute to a given behavior?\nInfluence functions aim to answer a counterfactual: how would the model's\nparameters (and hence its outputs) change if a given sequence were added to the\ntraining set? While influence functions have produced insights for small\nmodels, they are difficult to scale to large language models (LLMs) due to the\ndifficulty of computing an inverse-Hessian-vector product (IHVP). We use the\nEigenvalue-corrected Kronecker-Factored Approximate Curvature (EK-FAC)\napproximation to scale influence functions up to LLMs with up to 52 billion\nparameters. In our experiments, EK-FAC achieves similar accuracy to traditional\ninfluence function estimators despite the IHVP computation being orders of\nmagnitude faster. We investigate two algorithmic techniques to reduce the cost\nof computing gradients of candidate training sequences: TF-IDF filtering and\nquery batching. We use influence functions to investigate the generalization\npatterns of LLMs, including the sparsity of the influence patterns, increasing\nabstraction with scale, math and programming abilities, cross-lingual\ngeneralization, and role-playing behavior. Despite many apparently\nsophisticated forms of generalization, we identify a surprising limitation:\ninfluences decay to near-zero when the order of key phrases is flipped.\nOverall, influence functions give us a powerful new tool for studying the\ngeneralization properties of LLMs.\n","authors":["Roger Grosse","Juhan Bae","Cem Anil","Nelson Elhage","Alex Tamkin","Amirhossein Tajdini","Benoit Steiner","Dustin Li","Esin Durmus","Ethan Perez","Evan Hubinger","Kamilė Lukošiūtė","Karina Nguyen","Nicholas Joseph","Sam McCandlish","Jared Kaplan","Samuel R. Bowman"],"pdf_url":"https://arxiv.org/pdf/2308.03296v1.pdf","comment":"119 pages, 47 figures, 22 tables"},{"id":"http://arxiv.org/abs/2308.01814v2","updated":"2023-08-07T04:47:32Z","published":"2023-08-03T15:22:51Z","title":"Tensor Programs IVb: Adaptive Optimization in the Infinite-Width Limit","summary":" Going beyond stochastic gradient descent (SGD), what new phenomena emerge in\nwide neural networks trained by adaptive optimizers like Adam? Here we show:\nThe same dichotomy between feature learning and kernel behaviors (as in SGD)\nholds for general optimizers as well, including Adam -- albeit with a nonlinear\nnotion of \"kernel.\" We derive the corresponding \"neural tangent\" and \"maximal\nupdate\" limits for any architecture. 
Two foundational advances underlie the\nabove results: 1) A new Tensor Program language, NEXORT, that can express how\nadaptive optimizers process gradients into updates. 2) The introduction of\nbra-ket notation to drastically simplify expressions and calculations in Tensor\nPrograms. This work summarizes and generalizes all previous results in the\nTensor Programs series of papers.\n","authors":["Greg Yang","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2308.01814v2.pdf","comment":"This is the complete version of \"Adaptive Optimization in the\n Infinite-Width Limit\" in ICLR 2023,\n https://openreview.net/forum?id=zgVDqw9ZUES"},{"id":"http://arxiv.org/abs/2308.03295v1","updated":"2023-08-07T04:44:12Z","published":"2023-08-07T04:44:12Z","title":"DOMINO: Domain-invariant Hyperdimensional Classification for\n Multi-Sensor Time Series Data","summary":" With the rapid evolution of the Internet of Things, many real-world\napplications utilize heterogeneously connected sensors to capture time-series\ninformation. Edge-based machine learning (ML) methodologies are often employed\nto analyze locally collected data. However, a fundamental issue across\ndata-driven ML approaches is distribution shift. It occurs when a model is\ndeployed on a data distribution different from what it was trained on, and can\nsubstantially degrade model performance. Additionally, increasingly\nsophisticated deep neural networks (DNNs) have been proposed to capture spatial\nand temporal dependencies in multi-sensor time series data, requiring intensive\ncomputational resources beyond the capacity of today's edge devices. While\nbrain-inspired hyperdimensional computing (HDC) has been introduced as a\nlightweight solution for edge-based learning, existing HDCs are also vulnerable\nto the distribution shift challenge. In this paper, we propose DOMINO, a novel\nHDC learning framework addressing the distribution shift problem in noisy\nmulti-sensor time-series data. DOMINO leverages efficient and parallel matrix\noperations on high-dimensional space to dynamically identify and filter out\ndomain-variant dimensions. Our evaluation on a wide range of multi-sensor time\nseries classification tasks shows that DOMINO achieves on average 2.04% higher\naccuracy than state-of-the-art (SOTA) DNN-based domain generalization\ntechniques, and delivers 7.83x faster training and 26.94x faster inference.\nMore importantly, DOMINO performs notably better when learning from partially\nlabeled and highly imbalanced data, providing 10.93x higher robustness against\nhardware noises than SOTA DNNs.\n","authors":["Junyao Wang","Luke Chen","Mohammad Abdullah Al Faruque"],"pdf_url":"https://arxiv.org/pdf/2308.03295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02360v2","updated":"2023-08-07T04:32:21Z","published":"2023-08-04T14:52:22Z","title":"Intensity-free Integral-based Learning of Marked Temporal Point\n Processes","summary":" In the marked temporal point processes (MTPP), a core problem is to\nparameterize the conditional joint PDF (probability distribution function)\n$p^*(m,t)$ for inter-event time $t$ and mark $m$, conditioned on the history.\nThe majority of existing studies predefine intensity functions. Their utility\nis challenged by specifying the intensity function's proper form, which is\ncritical to balance expressiveness and processing efficiency. 
Recently, there\nare studies moving away from predefining the intensity function -- one models\n$p^*(t)$ and $p^*(m)$ separately, while the other focuses on temporal point\nprocesses (TPPs), which do not consider marks. This study aims to develop\nhigh-fidelity $p^*(m,t)$ for discrete events where the event marks are either\ncategorical or numeric in a multi-dimensional continuous space. We propose a\nsolution framework IFIB (\\underline{I}ntensity-\\underline{f}ree\n\\underline{I}ntegral-\\underline{b}ased process) that models conditional joint\nPDF $p^*(m,t)$ directly without intensity functions. It remarkably simplifies\nthe process to compel the essential mathematical restrictions. We show the\ndesired properties of IFIB and the superior experimental results of IFIB on\nreal-world and synthetic datasets. The code is available at\n\\url{https://github.com/StepinSilence/IFIB}.\n","authors":["Sishun Liu","Ke Deng","Xiuzhen Zhang","Yongli Ren"],"pdf_url":"https://arxiv.org/pdf/2308.02360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03291v1","updated":"2023-08-07T04:20:38Z","published":"2023-08-07T04:20:38Z","title":"SynJax: Structured Probability Distributions for JAX","summary":" The development of deep learning software libraries enabled significant\nprogress in the field by allowing users to focus on modeling, while letting the\nlibrary to take care of the tedious and time-consuming task of optimizing\nexecution for modern hardware accelerators. However, this has benefited only\nparticular types of deep learning models, such as Transformers, whose\nprimitives map easily to the vectorized computation. The models that explicitly\naccount for structured objects, such as trees and segmentations, did not\nbenefit equally because they require custom algorithms that are difficult to\nimplement in a vectorized form.\n SynJax directly addresses this problem by providing an efficient vectorized\nimplementation of inference algorithms for structured distributions covering\nalignment, tagging, segmentation, constituency trees and spanning trees. With\nSynJax we can build large-scale differentiable models that explicitly model\nstructure in the data. The code is available at\nhttps://github.com/deepmind/synjax.\n","authors":["Miloš Stanojević","Laurent Sartran"],"pdf_url":"https://arxiv.org/pdf/2308.03291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03290v1","updated":"2023-08-07T04:17:19Z","published":"2023-08-07T04:17:19Z","title":"FLIQS: One-Shot Mixed-Precision Floating-Point and Integer Quantization\n Search","summary":" Quantization has become a mainstream compression technique for reducing model\nsize, computational requirements, and energy consumption for modern deep neural\nnetworks (DNNs). With the improved numerical support in recent hardware,\nincluding multiple variants of integer and floating point, mixed-precision\nquantization has become necessary to achieve high-quality results with low\nmodel cost. Prior mixed-precision quantization methods have performed a\npost-training quantization search, which compromises on accuracy, or a\ndifferentiable quantization search, which leads to high memory usage from\nbranching. Therefore, we propose the first one-shot mixed-precision\nquantization search that eliminates the need for retraining in both integer and\nlow-precision floating point models. We evaluate our floating-point and integer\nquantization search (FLIQS) on multiple convolutional networks and vision\ntransformer models to discover Pareto-optimal models. 
Our approach discovers\nmodels that improve upon uniform precision, manual mixed-precision, and recent\ninteger quantization search methods. With the proposed integer quantization\nsearch, we increase the accuracy of ResNet-18 on ImageNet by 1.31% points and\nResNet-50 by 0.90% points with equivalent model cost over previous methods.\nAdditionally, for the first time, we explore a novel mixed-precision\nfloating-point search and improve MobileNetV2 by up to 0.98% points compared to\nprior state-of-the-art FP8 models. Finally, we extend FLIQS to simultaneously\nsearch a joint quantization and neural architecture space and improve the\nImageNet accuracy by 2.69% points with similar model cost on a MobileNetV2\nsearch space.\n","authors":["Jordan Dotzel","Gang Wu","Andrew Li","Muhammad Umar","Yun Ni","Mohamed S. Abdelfattah","Zhiru Zhang","Liqun Cheng","Martin G. Dixon","Norman P. Jouppi","Quoc V. Le","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.03290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15306v2","updated":"2023-08-07T04:07:06Z","published":"2022-06-30T14:24:32Z","title":"Transfer Learning with Deep Tabular Models","summary":" Recent work on deep learning for tabular data demonstrates the strong\nperformance of deep tabular models, often bridging the gap between gradient\nboosted decision trees and neural networks. Accuracy aside, a major advantage\nof neural models is that they learn reusable features and are easily fine-tuned\nin new domains. This property is often exploited in computer vision and natural\nlanguage applications, where transfer learning is indispensable when\ntask-specific training data is scarce. In this work, we demonstrate that\nupstream data gives tabular neural networks a decisive advantage over widely\nused GBDT models. We propose a realistic medical diagnosis benchmark for\ntabular transfer learning, and we present a how-to guide for using upstream\ndata to boost performance with a variety of tabular neural network\narchitectures. Finally, we propose a pseudo-feature method for cases where the\nupstream and downstream feature sets differ, a tabular-specific problem\nwidespread in real-world applications. Our code is available at\nhttps://github.com/LevinRoman/tabular-transfer-learning .\n","authors":["Roman Levin","Valeriia Cherepanova","Avi Schwarzschild","Arpit Bansal","C. Bayan Bruss","Tom Goldstein","Andrew Gordon Wilson","Micah Goldblum"],"pdf_url":"https://arxiv.org/pdf/2206.15306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03283v1","updated":"2023-08-07T04:00:13Z","published":"2023-08-07T04:00:13Z","title":"High-rate discretely-modulated continuous-variable quantum key\n distribution using quantum machine learning","summary":" We propose a high-rate scheme for discretely-modulated continuous-variable\nquantum key distribution (DM CVQKD) using quantum machine learning\ntechnologies, which divides the whole CVQKD system into three parts, i.e., the\ninitialization part that is used for training and estimating quantum\nclassifier, the prediction part that is used for generating highly correlated\nraw keys, and the data-postprocessing part that generates the final secret key\nstring shared by Alice and Bob. To this end, a low-complexity quantum k-nearest\nneighbor (QkNN) classifier is designed for predicting the lossy\ndiscretely-modulated coherent states (DMCSs) at Bob's side. 
The performance of\nthe proposed QkNN-based CVQKD especially in terms of machine learning metrics\nand complexity is analyzed, and its theoretical security is proved by using\nsemi-definite program (SDP) method. Numerical simulation shows that the secret\nkey rate of our proposed scheme is explicitly superior to the existing DM CVQKD\nprotocols, and it can be further enhanced with the increase of modulation\nvariance.\n","authors":["Qin Liao","Jieyu Liu","Anqi Huang","Lei Huang","Zhuoying Fei","Xiquan Fu"],"pdf_url":"https://arxiv.org/pdf/2308.03283v1.pdf","comment":"18 pages, 17 figures"},{"id":"http://arxiv.org/abs/2212.09201v2","updated":"2023-08-07T03:33:28Z","published":"2022-12-19T00:42:21Z","title":"Spectral Regularized Kernel Two-Sample Tests","summary":" Over the last decade, an approach that has gained a lot of popularity to\ntackle non-parametric testing problems on general (i.e., non-Euclidean) domains\nis based on the notion of reproducing kernel Hilbert space (RKHS) embedding of\nprobability distributions. The main goal of our work is to understand the\noptimality of two-sample tests constructed based on this approach. First, we\nshow that the popular MMD (maximum mean discrepancy) two-sample test is not\noptimal in terms of the separation boundary measured in Hellinger distance.\nSecond, we propose a modification to the MMD test based on spectral\nregularization by taking into account the covariance information (which is not\ncaptured by the MMD test) and prove the proposed test to be minimax optimal\nwith a smaller separation boundary than that achieved by the MMD test. Third,\nwe propose an adaptive version of the above test which involves a data-driven\nstrategy to choose the regularization parameter and show the adaptive test to\nbe almost minimax optimal up to a logarithmic factor. Moreover, our results\nhold for the permutation variant of the test where the test threshold is chosen\nelegantly through the permutation of the samples. Through numerical experiments\non synthetic and real-world data, we demonstrate the superior performance of\nthe proposed test in comparison to the MMD test.\n","authors":["Omar Hagrass","Bharath K. Sriperumbudur","Bing Li"],"pdf_url":"https://arxiv.org/pdf/2212.09201v2.pdf","comment":"63 pages"},{"id":"http://arxiv.org/abs/2308.03274v1","updated":"2023-08-07T03:32:39Z","published":"2023-08-07T03:32:39Z","title":"DSformer: A Double Sampling Transformer for Multivariate Time Series\n Long-term Prediction","summary":" Multivariate time series long-term prediction, which aims to predict the\nchange of data in a long time, can provide references for decision-making.\nAlthough transformer-based models have made progress in this field, they\nusually do not make full use of three features of multivariate time series:\nglobal information, local information, and variables correlation. To\neffectively mine the above three features and establish a high-precision\nprediction model, we propose a double sampling transformer (DSformer), which\nconsists of the double sampling (DS) block and the temporal variable attention\n(TVA) block. Firstly, the DS block employs down sampling and piecewise sampling\nto transform the original series into feature vectors that focus on global\ninformation and local information respectively. Then, TVA block uses temporal\nattention and variable attention to mine these feature vectors from different\ndimensions and extract key information. 
Finally, based on a parallel structure,\nDSformer uses multiple TVA blocks to mine and integrate different features\nobtained from DS blocks respectively. The integrated feature information is\npassed to the generative decoder based on a multi-layer perceptron to realize\nmultivariate time series long-term prediction. Experimental results on nine\nreal-world datasets show that DSformer can outperform eight existing baselines.\n","authors":["Chengqing Yu","Fei Wang","Zezhi Shao","Tao Sun","Lin Wu","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03274v1.pdf","comment":"Accepted by CIKM 2023 (FULL paper)"},{"id":"http://arxiv.org/abs/2103.00676v2","updated":"2023-08-07T03:25:37Z","published":"2021-03-01T01:00:09Z","title":"Token-Modification Adversarial Attacks for Natural Language Processing:\n A Survey","summary":" There are now many adversarial attacks for natural language processing\nsystems. Of these, a vast majority achieve success by modifying individual\ndocument tokens, which we call here a token-modification attack. Each\ntoken-modification attack is defined by a specific combination of fundamental\ncomponents, such as a constraint on the adversary or a particular search\nalgorithm. Motivated by this observation, we survey existing token-modification\nattacks and extract the components of each. We use an attack-independent\nframework to structure our survey which results in an effective categorisation\nof the field and an easy comparison of components. This survey aims to guide\nnew researchers to this field and spark further research into individual attack\ncomponents.\n","authors":["Tom Roth","Yansong Gao","Alsharif Abuadbba","Surya Nepal","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2103.00676v2.pdf","comment":"Version 2: updated"},{"id":"http://arxiv.org/abs/2308.03271v1","updated":"2023-08-07T03:23:46Z","published":"2023-08-07T03:23:46Z","title":"Local Structure-aware Graph Contrastive Representation Learning","summary":" Traditional Graph Neural Network (GNN), as a graph representation learning\nmethod, is constrained by label information. However, Graph Contrastive\nLearning (GCL) methods, which tackle the label problem effectively, mainly\nfocus on the feature information of the global graph or small subgraph\nstructure (e.g., the first-order neighborhood). In the paper, we propose a\nLocal Structure-aware Graph Contrastive representation Learning method (LS-GCL)\nto model the structural information of nodes from multiple views. Specifically,\nwe construct the semantic subgraphs that are not limited to the first-order\nneighbors. For the local view, the semantic subgraph of each target node is\ninput into a shared GNN encoder to obtain the target node embeddings at the\nsubgraph-level. Then, we use a pooling function to generate the subgraph-level\ngraph embeddings. For the global view, considering the original graph preserves\nindispensable semantic information of nodes, we leverage the shared GNN encoder\nto learn the target node embeddings at the global graph-level. The proposed\nLS-GCL model is optimized to maximize the common information among similar\ninstances at three various perspectives through a multi-level contrastive loss\nfunction. 
Experimental results on five datasets illustrate that our method\noutperforms state-of-the-art graph representation learning approaches for both\nnode classification and link prediction tasks.\n","authors":["Kai Yang","Yuan Liu","Zijuan Zhao","Peijin Ding","Wenqian Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.03271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03269v1","updated":"2023-08-07T03:19:59Z","published":"2023-08-07T03:19:59Z","title":"Simple Rule Injection for ComplEx Embeddings","summary":" Recent works in neural knowledge graph inference attempt to combine logic\nrules with knowledge graph embeddings to benefit from prior knowledge. However,\nthey usually cannot avoid rule grounding, and injecting a diverse set of rules\nhas still not been thoroughly explored. In this work, we propose InjEx, a\nmechanism to inject multiple types of rules through simple constraints, which\ncapture definite Horn rules. To start, we theoretically prove that InjEx can\ninject such rules. Next, to demonstrate that InjEx infuses interpretable prior\nknowledge into the embedding space, we evaluate InjEx on both the knowledge\ngraph completion (KGC) and few-shot knowledge graph completion (FKGC) settings.\nOur experimental results reveal that InjEx outperforms both baseline KGC models\nas well as specialized few-shot models while maintaining its scalability and\nefficiency.\n","authors":["Haodi Ma","Anthony Colas","Yuejie Wang","Ali Sadeghian","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02394v2","updated":"2023-08-07T03:07:59Z","published":"2023-05-03T19:29:26Z","title":"Defending against Insertion-based Textual Backdoor Attacks via\n Attribution","summary":" Textual backdoor attack, as a novel attack model, has been shown to be\neffective in adding a backdoor to the model during training. Defending against\nsuch backdoor attacks has become urgent and important. In this paper, we\npropose AttDef, an efficient attribution-based pipeline to defend against two\ninsertion-based poisoning attacks, BadNL and InSent. Specifically, we regard\nthe tokens with larger attribution scores as potential triggers since larger\nattribution words contribute more to the false prediction results and therefore\nare more likely to be poison triggers. Additionally, we further utilize an\nexternal pre-trained language model to distinguish whether input is poisoned or\nnot. We show that our proposed method can generalize sufficiently well in two\ncommon attack scenarios (poisoning training data and testing data), which\nconsistently improves previous methods. For instance, AttDef can successfully\nmitigate both attacks with an average accuracy of 79.97% (56.59% up) and 48.34%\n(3.99% up) under pre-training and post-training attack defense respectively,\nachieving the new state-of-the-art performance on prediction recovery over four\nbenchmark datasets.\n","authors":["Jiazhao Li","Zhuofeng Wu","Wei Ping","Chaowei Xiao","V. G. Vinod Vydiswaran"],"pdf_url":"https://arxiv.org/pdf/2305.02394v2.pdf","comment":"Findings of ACL 2023. Camera-ready version"},{"id":"http://arxiv.org/abs/2212.08254v2","updated":"2023-08-07T03:00:41Z","published":"2022-12-16T02:52:37Z","title":"RepQ-ViT: Scale Reparameterization for Post-Training Quantization of\n Vision Transformers","summary":" Post-training quantization (PTQ), which only requires a tiny dataset for\ncalibration without end-to-end retraining, is a light and practical model\ncompression technique. 
Recently, several PTQ schemes for vision transformers\n(ViTs) have been presented; unfortunately, they typically suffer from\nnon-trivial accuracy degradation, especially in low-bit cases. In this paper,\nwe propose RepQ-ViT, a novel PTQ framework for ViTs based on quantization scale\nreparameterization, to address the above issues. RepQ-ViT decouples the\nquantization and inference processes, where the former employs complex\nquantizers and the latter employs scale-reparameterized simplified quantizers.\nThis ensures both accurate quantization and efficient inference, which\ndistinguishes it from existing approaches that sacrifice quantization\nperformance to meet the target hardware. More specifically, we focus on two\ncomponents with extreme distributions: post-LayerNorm activations with severe\ninter-channel variation and post-Softmax activations with power-law features,\nand initially apply channel-wise quantization and log$\\sqrt{2}$ quantization,\nrespectively. Then, we reparameterize the scales to hardware-friendly\nlayer-wise quantization and log2 quantization for inference, with only slight\naccuracy or computational costs. Extensive experiments are conducted on\nmultiple vision tasks with different model variants, proving that RepQ-ViT,\nwithout hyperparameters and expensive reconstruction procedures, can outperform\nexisting strong baselines and encouragingly improve the accuracy of 4-bit PTQ\nof ViTs to a usable level. Code is available at\nhttps://github.com/zkkli/RepQ-ViT.\n","authors":["Zhikai Li","Junrui Xiao","Lianwei Yang","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2212.08254v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02180v2","updated":"2023-08-07T02:53:06Z","published":"2023-08-04T07:51:15Z","title":"Scaling Clinical Trial Matching Using Large Language Models: A Case\n Study in Oncology","summary":" Clinical trial matching is a key process in health delivery and discovery. In\npractice, it is plagued by overwhelming unstructured data and unscalable manual\nprocessing. In this paper, we conduct a systematic study on scaling clinical\ntrial matching using large language models (LLMs), with oncology as the focus\narea. Our study is grounded in a clinical trial matching system currently in\ntest deployment at a large U.S. health network. Initial findings are promising:\nout of box, cutting-edge LLMs, such as GPT-4, can already structure elaborate\neligibility criteria of clinical trials and extract complex matching logic\n(e.g., nested AND/OR/NOT). While still far from perfect, LLMs substantially\noutperform prior strong baselines and may serve as a preliminary solution to\nhelp triage patient-trial candidates with humans in the loop. 
Our study also\nreveals a few significant growth areas for applying LLMs to end-to-end clinical\ntrial matching, such as context limitation and accuracy, especially in\nstructuring patient information from longitudinal medical records.\n","authors":["Cliff Wong","Sheng Zhang","Yu Gu","Christine Moung","Jacob Abel","Naoto Usuyama","Roshanthi Weerasinghe","Brian Piening","Tristan Naumann","Carlo Bifulco","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.02180v2.pdf","comment":"24 pages, 5 figures, accepted at Machine Learning for Healthcare\n (MLHC) 2023"},{"id":"http://arxiv.org/abs/2308.03260v1","updated":"2023-08-07T02:42:21Z","published":"2023-08-07T02:42:21Z","title":"Exploring Different Time-series-Transformer (TST) Architectures: A Case\n Study in Battery Life Prediction for Electric Vehicles (EVs)","summary":" In recent years, battery technology for electric vehicles (EVs) has been a\nmajor focus, with a significant emphasis on developing new battery materials\nand chemistries. However, accurately predicting key battery parameters, such as\nstate-of-charge (SOC) and temperature, remains a challenge for constructing\nadvanced battery management systems (BMS). Existing battery models do not\ncomprehensively cover all parameters affecting battery performance, including\nnon-battery-related factors like ambient temperature, cabin temperature,\nelevation, and regenerative braking during EV operation. Due to the difficulty\nof incorporating these auxiliary parameters into traditional models, a\ndata-driven approach is suggested. Time-series-transformers (TSTs), leveraging\nmultiheaded attention and parallelization-friendly architecture, are explored\nalongside LSTM models. Novel TST architectures, including encoder TST + decoder\nLSTM and a hybrid TST-LSTM, are also developed and compared against existing\nmodels. A dataset comprising 72 driving trips in a BMW i3 (60 Ah) is used to\naddress battery life prediction in EVs, aiming to create accurate TST models\nthat incorporate environmental, battery, vehicle driving, and heating circuit\ndata to predict SOC and battery temperature for future time steps.\n","authors":["Niranjan Sitapure","Atharva Kulkarni"],"pdf_url":"https://arxiv.org/pdf/2308.03260v1.pdf","comment":"13 pages and 7 figures"},{"id":"http://arxiv.org/abs/2308.03259v1","updated":"2023-08-07T02:37:02Z","published":"2023-08-07T02:37:02Z","title":"Optimal Approximation and Learning Rates for Deep Convolutional Neural\n Networks","summary":" This paper focuses on approximation and learning performance analysis for\ndeep convolutional neural networks with zero-padding and max-pooling. We prove\nthat, to approximate $r$-smooth function, the approximation rates of deep\nconvolutional neural networks with depth $L$ are of order $ (L^2/\\log\nL)^{-2r/d} $, which is optimal up to a logarithmic factor. Furthermore, we\ndeduce almost optimal learning rates for implementing empirical risk\nminimization over deep convolutional neural networks.\n","authors":["Shao-Bo Lin"],"pdf_url":"https://arxiv.org/pdf/2308.03259v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2301.01470v5","updated":"2023-08-07T02:06:09Z","published":"2023-01-04T07:16:46Z","title":"Model Parameter Identification via a Hyperparameter Optimization Scheme\n for Autonomous Racing Systems","summary":" In this letter, we propose a model parameter identification method via a\nhyperparameter optimization scheme (MI-HPO). 
Our method adopts an efficient\nexplore-exploit strategy to identify the parameters of dynamic models in a\ndata-driven optimization manner. We utilize our method for model parameter\nidentification of the AV-21, a full-scaled autonomous race vehicle. We then\nincorporate the optimized parameters for the design of model-based planning and\ncontrol systems of our platform. In experiments, MI-HPO exhibits more than 13\ntimes faster convergence than traditional parameter identification methods.\nFurthermore, the parametric models learned via MI-HPO demonstrate good fitness\nto the given datasets and show generalization ability in unseen dynamic\nscenarios. We further conduct extensive field tests to validate our model-based\nsystem, demonstrating stable obstacle avoidance and high-speed driving up to\n217 km/h at the Indianapolis Motor Speedway and Las Vegas Motor Speedway. The\nsource code for our work and videos of the tests are available at\nhttps://github.com/hynkis/MI-HPO.\n","authors":["Hyunki Seong","Chanyoung Chung","David Hyunchul Shim"],"pdf_url":"https://arxiv.org/pdf/2301.01470v5.pdf","comment":"6 pages, 8 figures. Published in IEEE Control Systems Letters (L-CSS)"},{"id":"http://arxiv.org/abs/2304.06833v3","updated":"2023-08-07T01:41:25Z","published":"2023-04-13T21:54:53Z","title":"Estimate-Then-Optimize versus Integrated-Estimation-Optimization versus\n Sample Average Approximation: A Stochastic Dominance Perspective","summary":" In data-driven stochastic optimization, model parameters of the underlying\ndistribution need to be estimated from data in addition to the optimization\ntask. Recent literature considers integrating the estimation and optimization\nprocesses by selecting model parameters that lead to the best empirical\nobjective performance. This integrated approach, which we call\nintegrated-estimation-optimization (IEO), can be readily shown to outperform\nsimple estimate-then-optimize (ETO) when the model is misspecified. In this\npaper, we show that a reverse behavior appears when the model class is\nwell-specified and there is sufficient data. Specifically, for a general class\nof nonlinear stochastic optimization problems, we show that simple ETO\noutperforms IEO asymptotically when the model class covers the ground truth, in\nthe strong sense of stochastic dominance of the regret. Namely, the entire\ndistribution of the regret, not only its mean or other moments, is always\nbetter for ETO compared to IEO. Our results also apply to constrained,\ncontextual optimization problems where the decision depends on observed\nfeatures. Whenever applicable, we also demonstrate how standard sample average\napproximation (SAA) performs the worst when the model class is well-specified\nin terms of regret, and best when it is misspecified. Finally, we provide\nexperimental results to support our theoretical comparisons and illustrate when\nour insights hold in finite-sample regimes and under various degrees of\nmisspecification.\n","authors":["Adam N. Elmachtoub","Henry Lam","Haofeng Zhang","Yunfan Zhao"],"pdf_url":"https://arxiv.org/pdf/2304.06833v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03243v1","updated":"2023-08-07T01:41:21Z","published":"2023-08-07T01:41:21Z","title":"Unsupervised Adversarial Detection without Extra Model: Training Loss\n Should Change","summary":" Adversarial robustness poses a critical challenge in the deployment of deep\nlearning models for real-world applications. 
Traditional approaches to\nadversarial training and supervised detection rely on prior knowledge of attack\ntypes and access to labeled training data, which is often impractical. Existing\nunsupervised adversarial detection methods identify whether the target model\nworks properly, but they suffer from bad accuracies owing to the use of common\ncross-entropy training loss, which relies on unnecessary features and\nstrengthens adversarial attacks. We propose new training losses to reduce\nuseless features and the corresponding detection method without prior knowledge\nof adversarial attacks. The detection rate (true positive rate) against all\ngiven white-box attacks is above 93.9% except for attacks without limits\n(DF($\\infty$)), while the false positive rate is barely 2.5%. The proposed\nmethod works well in all tested attack types and the false positive rates are\neven better than the methods good at certain types.\n","authors":["Chien Cheng Chyou","Hung-Ting Su","Winston H. Hsu"],"pdf_url":"https://arxiv.org/pdf/2308.03243v1.pdf","comment":"AdvML in ICML 2023\n code:https://github.com/CycleBooster/Unsupervised-adversarial-detection-without-extra-model"},{"id":"http://arxiv.org/abs/2308.03239v1","updated":"2023-08-07T01:32:09Z","published":"2023-08-07T01:32:09Z","title":"Asynchronous Decentralized Q-Learning: Two Timescale Analysis By\n Persistence","summary":" Non-stationarity is a fundamental challenge in multi-agent reinforcement\nlearning (MARL), where agents update their behaviour as they learn. Many\ntheoretical advances in MARL avoid the challenge of non-stationarity by\ncoordinating the policy updates of agents in various ways, including\nsynchronizing times at which agents are allowed to revise their policies.\nSynchronization enables analysis of many MARL algorithms via multi-timescale\nmethods, but such synchrony is infeasible in many decentralized applications.\nIn this paper, we study an asynchronous variant of the decentralized Q-learning\nalgorithm, a recent MARL algorithm for stochastic games. We provide sufficient\nconditions under which the asynchronous algorithm drives play to equilibrium\nwith high probability. Our solution utilizes constant learning rates in the\nQ-factor update, which we show to be critical for relaxing the synchrony\nassumptions of earlier work. Our analysis also applies to asynchronous\ngeneralizations of a number of other algorithms from the regret testing\ntradition, whose performance is analyzed by multi-timescale methods that study\nMarkov chains obtained via policy update dynamics. This work extends the\napplicability of the decentralized Q-learning algorithm and its relatives to\nsettings in which parameters are selected in an independent manner, and tames\nnon-stationarity without imposing the coordination assumptions of prior work.\n","authors":["Bora Yongacoglu","Gürdal Arslan","Serdar Yüksel"],"pdf_url":"https://arxiv.org/pdf/2308.03239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03236v1","updated":"2023-08-07T01:25:10Z","published":"2023-08-07T01:25:10Z","title":"G-Mix: A Generalized Mixup Learning Framework Towards Flat Minima","summary":" Deep neural networks (DNNs) have demonstrated promising results in various\ncomplex tasks. However, current DNNs encounter challenges with\nover-parameterization, especially when there is limited training data\navailable. To enhance the generalization capability of DNNs, the Mixup\ntechnique has gained popularity. Nevertheless, it still produces suboptimal\noutcomes. 
Inspired by the successful Sharpness-Aware Minimization (SAM)\napproach, which establishes a connection between the sharpness of the training\nloss landscape and model generalization, we propose a new learning framework\ncalled Generalized-Mixup, which combines the strengths of Mixup and SAM for\ntraining DNN models. The theoretical analysis provided demonstrates how the\ndeveloped G-Mix framework enhances generalization. Additionally, to further\noptimize DNN performance with the G-Mix framework, we introduce two novel\nalgorithms: Binary G-Mix and Decomposed G-Mix. These algorithms partition the\ntraining data into two subsets based on the sharpness-sensitivity of each\nexample to address the issue of \"manifold intrusion\" in Mixup. Both theoretical\nexplanations and experimental results reveal that the proposed BG-Mix and\nDG-Mix algorithms further enhance model generalization across multiple datasets\nand models, achieving state-of-the-art performance.\n","authors":["Xingyu Li","Bo Tang"],"pdf_url":"https://arxiv.org/pdf/2308.03236v1.pdf","comment":"19 pages, 23 figures"},{"id":"http://arxiv.org/abs/2212.12294v2","updated":"2023-08-07T01:21:19Z","published":"2022-12-23T12:51:42Z","title":"FFNeRV: Flow-Guided Frame-Wise Neural Representations for Videos","summary":" Neural fields, also known as coordinate-based or implicit neural\nrepresentations, have shown a remarkable capability of representing,\ngenerating, and manipulating various forms of signals. For video\nrepresentations, however, mapping pixel-wise coordinates to RGB colors has\nshown relatively low compression performance and slow convergence and inference\nspeed. Frame-wise video representation, which maps a temporal coordinate to its\nentire frame, has recently emerged as an alternative method to represent\nvideos, improving compression rates and encoding speed. While promising, it has\nstill failed to reach the performance of state-of-the-art video compression\nalgorithms. In this work, we propose FFNeRV, a novel method for incorporating\nflow information into frame-wise representations to exploit the temporal\nredundancy across the frames in videos inspired by the standard video codecs.\nFurthermore, we introduce a fully convolutional architecture, enabled by\none-dimensional temporal grids, improving the continuity of spatial features.\nExperimental results show that FFNeRV yields the best performance for video\ncompression and frame interpolation among the methods using frame-wise\nrepresentations or neural fields. To reduce the model size even further, we\ndevise a more compact convolutional architecture using the group and pointwise\nconvolutions. With model compression techniques, including quantization-aware\ntraining and entropy coding, FFNeRV outperforms widely-used standard video\ncodecs (H.264 and HEVC) and performs on par with state-of-the-art video\ncompression algorithms.\n","authors":["Joo Chan Lee","Daniel Rho","Jong Hwan Ko","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2212.12294v2.pdf","comment":"Our project page including code is available at\n https://maincold2.github.io/ffnerv/"},{"id":"http://arxiv.org/abs/2206.02659v5","updated":"2023-08-07T01:20:01Z","published":"2022-06-06T14:52:46Z","title":"Robust Fine-Tuning of Deep Neural Networks with Hessian-based\n Generalization Guarantees","summary":" We consider fine-tuning a pretrained deep neural network on a target task. 
We\nstudy the generalization properties of fine-tuning to understand the problem of\noverfitting, which has often been observed (e.g., when the target dataset is\nsmall or when the training labels are noisy). Existing generalization measures\nfor deep networks depend on notions such as distance from the initialization\n(i.e., the pretrained network) of the fine-tuned model and noise stability\nproperties of deep networks. This paper identifies a Hessian-based distance\nmeasure through PAC-Bayesian analysis, which is shown to correlate well with\nobserved generalization gaps of fine-tuned models. Theoretically, we prove\nHessian distance-based generalization bounds for fine-tuned models. We also\ndescribe an extended study of fine-tuning against label noise, where\noverfitting is a critical problem; we present an algorithm and a\ngeneralization error guarantee for this algorithm under a class conditional\nindependent noise model. Empirically, we observe that the Hessian-based\ndistance measure can match the scale of the observed generalization gap of\nfine-tuned models in practice. We also test our algorithm on several image\nclassification tasks with noisy training labels, showing notable gains over\nprior methods, and the Hessian distance measure of the fine-tuned model\ndecreases substantially.\n","authors":["Haotian Ju","Dongyue Li","Hongyang R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2206.02659v5.pdf","comment":"37 pages. Appeared in ICML 2022"},{"id":"http://arxiv.org/abs/2308.03235v1","updated":"2023-08-07T01:10:50Z","published":"2023-08-07T01:10:50Z","title":"Analysis of the Evolution of Advanced Transformer-Based Language Models:\n Experiments on Opinion Mining","summary":" Opinion mining, also known as sentiment analysis, is a subfield of natural\nlanguage processing (NLP) that focuses on identifying and extracting subjective\ninformation in textual material. This can include determining the overall\nsentiment of a piece of text (e.g., positive or negative), as well as\nidentifying specific emotions or opinions expressed in the text, which involves\nthe use of advanced machine and deep learning techniques. Recently,\ntransformer-based language models have made this task of human emotion analysis\nintuitive, thanks to the attention mechanism and parallel computation. These\nadvantages make such models very powerful on linguistic tasks, unlike recurrent\nneural networks that spend a lot of time on sequential processing, making them\nprone to fail when it comes to processing long text. Our paper\naims to study the behaviour of the cutting-edge Transformer-based language\nmodels on opinion mining and provide a high-level comparison between them to\nhighlight their key particularities. Additionally, our comparative study provides\nleads and paves the way for production engineers regarding the approach to\nfocus on, and is useful for researchers as it provides guidelines for future\nresearch subjects.\n","authors":["Nour Eddine Zekaoui","Siham Yousfi","Maryem Rhanoui","Mounia Mikram"],"pdf_url":"https://arxiv.org/pdf/2308.03235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03231v1","updated":"2023-08-07T00:30:29Z","published":"2023-08-07T00:30:29Z","title":"Imbalanced Large Graph Learning Framework for FPGA Logic Elements\n Packing Prediction","summary":" Packing is a required step in a typical FPGA CAD flow. It has a high impact on\nthe performance of FPGA placement and routing. 
Early prediction of packing\nresults can guide design optimization and expedite design closure. In this\nwork, we propose an imbalanced large graph learning framework, ImLG, for\nprediction of whether logic elements will be packed after placement.\nSpecifically, we propose dedicated feature extraction and feature aggregation\nmethods to enhance the node representation learning of circuit graphs. With\nimbalanced distribution of packed and unpacked logic elements, we further\npropose techniques such as graph oversampling and mini-batch training for this\nimbalanced learning task in large circuit graphs. Experimental results\ndemonstrate that our framework can improve the F1 score by 42.82% compared to\nthe most recent Gaussian-based prediction method. Physical design results show\nthat the proposed method can assist the placer in improving routed wirelength\nby 0.93% and SLICE occupation by 0.89%.\n","authors":["Zhixiong Di","Runzhe Tao","Lin Chen","Qiang Wu","Yibo Lin"],"pdf_url":"https://arxiv.org/pdf/2308.03231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03230v1","updated":"2023-08-07T00:14:46Z","published":"2023-08-07T00:14:46Z","title":"Tractability of approximation by general shallow networks","summary":" In this paper, we present a sharper version of the results in the paper\nDimension independent bounds for general shallow networks; Neural Networks,\n\\textbf{123} (2020), 142-152. Let $\\mathbb{X}$ and $\\mathbb{Y}$ be compact\nmetric spaces. We consider approximation of functions of the form $\nx\\mapsto\\int_{\\mathbb{Y}} G( x, y)d\\tau( y)$, $ x\\in\\mathbb{X}$, by\n$G$-networks of the form $ x\\mapsto \\sum_{k=1}^n a_kG( x, y_k)$, $ y_1,\\cdots,\ny_n\\in\\mathbb{Y}$, $a_1,\\cdots, a_n\\in\\mathbb{R}$. Defining the dimensions of\n$\\mathbb{X}$ and $\\mathbb{Y}$ in terms of covering numbers, we obtain dimension\nindependent bounds on the degree of approximation in terms of $n$, where also\nthe constants involved are all dependent at most polynomially on the\ndimensions. Applications include approximation by power rectified linear unit\nnetworks, zonal function networks, certain radial basis function networks as\nwell as the important problem of function extension to higher dimensional\nspaces.\n","authors":["Hrushikesh Mhaskar","Tong Mao"],"pdf_url":"https://arxiv.org/pdf/2308.03230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03960v1","updated":"2023-08-07T23:52:03Z","published":"2023-08-07T23:52:03Z","title":"Amortized Global Search for Efficient Preliminary Trajectory Design with\n Deep Generative Models","summary":" Preliminary trajectory design is a global search problem that seeks multiple\nqualitatively different solutions to a trajectory optimization problem. Due to\nits high dimensionality and non-convexity, and the frequent adjustment of\nproblem parameters, the global search becomes computationally demanding. In\nthis paper, we exploit the clustering structure in the solutions and propose an\namortized global search (AmorGS) framework. We use deep generative models to\npredict trajectory solutions that share similar structures with previously\nsolved problems, which accelerates the global search for unseen parameter\nvalues. 
Our method is evaluated using De Jong's 5th function and a low-thrust\ncircular restricted three-body problem.\n","authors":["Anjian Li","Amlan Sinha","Ryne Beeson"],"pdf_url":"https://arxiv.org/pdf/2308.03960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03956v1","updated":"2023-08-07T23:46:14Z","published":"2023-08-07T23:46:14Z","title":"Fixed Inter-Neuron Covariability Induces Adversarial Robustness","summary":" The vulnerability to adversarial perturbations is a major flaw of Deep Neural\nNetworks (DNNs) that raises questions about their reliability in real-world\nscenarios. On the other hand, human perception, which DNNs are supposed to\nemulate, is highly robust to such perturbations, indicating that there may be\ncertain features of human perception that make it robust but are not\nrepresented in the current class of DNNs. One such feature is that the activity\nof biological neurons is correlated and the structure of this correlation tends\nto be rather rigid over long spans of time, even if it hampers performance and\nlearning. We hypothesize that integrating such constraints on the activations\nof a DNN would improve its adversarial robustness, and, to test this\nhypothesis, we have developed the Self-Consistent Activation (SCA) layer, which\ncomprises neurons whose activations are consistent with each other, as they\nconform to a fixed, but learned, covariability pattern. When evaluated on image\nand sound recognition tasks, the models with an SCA layer achieved high\naccuracy, and exhibited significantly greater robustness than multi-layer\nperceptron models to state-of-the-art Auto-PGD adversarial attacks\n\textit{without being trained on adversarially perturbed data}.\n","authors":["Muhammad Ahmed Shah","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2308.03956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03953v1","updated":"2023-08-07T23:44:35Z","published":"2023-08-07T23:44:35Z","title":"PMU measurements based short-term voltage stability assessment of power\n systems via deep transfer learning","summary":" Deep learning has emerged as an effective solution for addressing the\nchallenges of short-term voltage stability assessment (STVSA) in power systems.\nHowever, existing deep learning-based STVSA approaches face limitations in\nadapting to topological changes, sample labeling, and handling small datasets.\nTo overcome these challenges, this paper proposes a novel phasor measurement\nunit (PMU) measurements-based STVSA method by using deep transfer learning. The\nmethod leverages the real-time dynamic information captured by PMUs to create\nan initial dataset. It employs temporal ensembling for sample labeling and\nutilizes least squares generative adversarial networks (LSGAN) for data\naugmentation, enabling effective deep learning on small-scale datasets.\nAdditionally, the method enhances adaptability to topological changes by\nexploring connections between different faults. Experimental results on the\nIEEE 39-bus test system demonstrate that the proposed method improves model\nevaluation accuracy by approximately 20% through transfer learning, exhibiting\nstrong adaptability to topological changes. 
Leveraging the self-attention\nmechanism of the Transformer model, this approach offers significant advantages\nover shallow learning methods and other deep learning-based approaches.\n","authors":["Yang Li","Shitu Zhang","Yuanzheng Li","Jiting Cao","Shuyue Jia"],"pdf_url":"https://arxiv.org/pdf/2308.03953v1.pdf","comment":"Accepted by IEEE Transactions on Instrumentation & Measurement"},{"id":"http://arxiv.org/abs/2308.03945v1","updated":"2023-08-07T23:27:20Z","published":"2023-08-07T23:27:20Z","title":"The Prospect of Enhancing Large-Scale Heterogeneous Federated Learning\n with Transformers","summary":" Federated learning (FL) addresses data privacy concerns by enabling\ncollaborative training of AI models across distributed data owners. Wide\nadoption of FL faces the fundamental challenges of data heterogeneity and the\nlarge scale of data owners involved. In this paper, we investigate the prospect\nof Transformer-based FL models for achieving generalization and personalization\nin this setting. We conduct extensive comparative experiments involving FL with\nTransformers, ResNet, and personalized ResNet-based FL approaches under various\nscenarios. These experiments consider varying numbers of data owners to\ndemonstrate Transformers' advantages over deep neural networks in large-scale\nheterogeneous FL tasks. In addition, we analyze the superior performance of\nTransformers by comparing the Centered Kernel Alignment (CKA) representation\nsimilarity across different layers and FL models to gain insight into the\nreasons behind their promising capabilities.\n","authors":["Yulan Gao","Hao Sun","Zengxiang Li","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03944v1","updated":"2023-08-07T23:19:34Z","published":"2023-08-07T23:19:34Z","title":"GraPhSyM: Graph Physical Synthesis Model","summary":" In this work, we introduce GraPhSyM, a Graph Attention Network (GATv2) model\nfor fast and accurate estimation of post-physical synthesis circuit delay and\narea metrics from pre-physical synthesis circuit netlists. Once trained,\nGraPhSyM provides accurate visibility of final design metrics to early EDA\nstages, such as logic synthesis, without running the slow physical synthesis\nflow, enabling global co-optimization across stages. Additionally, the swift\nand precise feedback provided by GraPhSym is instrumental for\nmachine-learning-based EDA optimization frameworks. Given a gate-level netlist\nof a circuit represented as a graph, GraPhSyM utilizes graph structure,\nconnectivity, and electrical property features to predict the impact of\nphysical synthesis transformations such as buffer insertion and gate sizing.\nWhen trained on a dataset of 6000 prefix adder designs synthesized at an\naggressive delay target, GraPhSyM can accurately predict the post-synthesis\ndelay (98.3%) and area (96.1%) metrics of unseen adders with a fast 0.22s\ninference time. Furthermore, we illustrate the compositionality of GraPhSyM by\nemploying the model trained on a fixed delay target to accurately anticipate\npost-synthesis metrics at a variety of unseen delay targets. 
Lastly, we report\npromising generalization capabilities of the GraPhSyM model when it is\nevaluated on circuits different from the adders it was exclusively trained on.\nThe results show the potential for GraPhSyM to serve as a powerful tool for\nadvanced optimization techniques and as an oracle for EDA machine learning\nframeworks.\n","authors":["Ahmed Agiza","Rajarshi Roy","Teodor Dumitru Ene","Saad Godil","Sherief Reda","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2308.03944v1.pdf","comment":"Accepted at ICCAD'23"},{"id":"http://arxiv.org/abs/2308.00824v2","updated":"2023-08-07T22:47:33Z","published":"2023-08-01T20:22:53Z","title":"An Exact Kernel Equivalence for Finite Classification Models","summary":" We explore the equivalence between neural networks and kernel methods by\nderiving the first exact representation of any finite-size parametric\nclassification model trained with gradient descent as a kernel machine. We\ncompare our exact representation to the well-known Neural Tangent Kernel (NTK)\nand discuss approximation error relative to the NTK and other non-exact path\nkernel formulations. We experimentally demonstrate that the kernel can be\ncomputed for realistic networks up to machine precision. We use this exact\nkernel to show that our theoretical contribution can provide useful insights\ninto the predictions made by neural networks, particularly the way in which\nthey generalize.\n","authors":["Brian Bell","Michael Geyer","David Glickenstein","Amanda Fernandez","Juston Moore"],"pdf_url":"https://arxiv.org/pdf/2308.00824v2.pdf","comment":"TAG-ML at ICML 2023 in Proceedings. 8 pages, 6 figures, proofs in\n Appendix"},{"id":"http://arxiv.org/abs/2204.01248v2","updated":"2023-08-07T22:21:24Z","published":"2022-04-04T05:27:40Z","title":"Differentiable Rendering for Synthetic Aperture Radar Imagery","summary":" There is rising interest in differentiable rendering, which allows explicitly\nmodeling geometric priors and constraints in optimization pipelines using\nfirst-order methods such as backpropagation. Incorporating such domain\nknowledge can lead to deep neural networks that are trained more robustly and\nwith limited data, as well as the capability to solve ill-posed inverse\nproblems. Existing efforts in differentiable rendering have focused on imagery\nfrom electro-optical sensors, particularly conventional RGB-imagery. In this\nwork, we propose an approach for differentiable rendering of Synthetic Aperture\nRadar (SAR) imagery, which combines methods from 3D computer graphics with\nneural rendering. We demonstrate the approach on the inverse graphics problem\nof 3D Object Reconstruction from limited SAR imagery using high-fidelity\nsimulated SAR data.\n","authors":["Michael Wilmanski","Jonathan Tamir"],"pdf_url":"https://arxiv.org/pdf/2204.01248v2.pdf","comment":"This version of the manuscript is an updated preprint which has been\n recently accepted by IEEE Transactions on Aerospace Electronic Systems, but\n has not yet been published or processed by IEEE"},{"id":"http://arxiv.org/abs/2308.03928v1","updated":"2023-08-07T22:12:48Z","published":"2023-08-07T22:12:48Z","title":"Optimizing the switching operation in monoclonal antibody production:\n Economic MPC and reinforcement learning","summary":" Monoclonal antibodies (mAbs) have emerged as indispensable assets in\nmedicine, and are currently at the forefront of biopharmaceutical product\ndevelopment. 
However, the growing market demand and the substantial doses\nrequired for mAb clinical treatments necessitate significant progress in its\nlarge-scale production. Most of the processes for industrial mAb production\nrely on batch operations, which result in significant downtime. The shift\ntowards a fully continuous and integrated manufacturing process holds the\npotential to boost product yield and quality, while eliminating the extra\nexpenses associated with storing intermediate products. The integrated\ncontinuous mAb production process can be divided into the upstream and\ndownstream processes. One crucial aspect that ensures the continuity of the\nintegrated process is the switching of the capture columns, which are typically\nchromatography columns operated in a fed-batch manner downstream. Due to the\ndiscrete nature of the switching operation, advanced process control algorithms\nsuch as economic MPC (EMPC) are computationally difficult to implement. This is\nbecause an integer nonlinear program (INLP) needs to be solved online at each\nsampling time. This paper introduces two computationally-efficient approaches\nfor EMPC implementation, namely, a sigmoid function approximation approach and\na rectified linear unit (ReLU) approximation approach. It also explores the\napplication of deep reinforcement learning (DRL). These three methods are\ncompared to the traditional switching approach, which is based on a 1% product\nbreakthrough rule and which involves no optimization.\n","authors":["Sandra A. Obiri","Song Bo","Bernard T. Agyeman","Benjamin Decardi-Nelson","Jinfeng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10331v3","updated":"2023-08-07T22:07:04Z","published":"2023-02-20T21:54:25Z","title":"Causal Razors","summary":" When performing causal discovery, assumptions have to be made on how the true\ncausal mechanism corresponds to the underlying joint probability distribution.\nThese assumptions are labeled as causal razors in this work. We review numerous\ncausal razors that appeared in the literature, and offer a comprehensive\nlogical comparison of them. In particular, we scrutinize an unpopular causal\nrazor, namely parameter minimality, in multinomial causal models and its\nlogical relations with other well-studied causal razors. Our logical result\nposes a dilemma in selecting a reasonable scoring criterion for score-based\ncausal search algorithms.\n","authors":["Wai-yin Lam"],"pdf_url":"https://arxiv.org/pdf/2302.10331v3.pdf","comment":"29 pages for the main paper. 14 pages for the supplementary materials"},{"id":"http://arxiv.org/abs/2308.02013v2","updated":"2023-08-07T21:34:44Z","published":"2023-08-03T20:08:23Z","title":"Federated Representation Learning for Automatic Speech Recognition","summary":" Federated Learning (FL) is a privacy-preserving paradigm, allowing edge\ndevices to learn collaboratively without sharing data. Edge devices like Alexa\nand Siri are prospective sources of unlabeled audio data that can be tapped to\nlearn robust audio representations. In this work, we bring Self-supervised\nLearning (SSL) and FL together to learn representations for Automatic Speech\nRecognition respecting data privacy constraints. We use the speaker and chapter\ninformation in the unlabeled speech dataset, Libri-Light, to simulate non-IID\nspeaker-siloed data distributions and pre-train an LSTM encoder with the\nContrastive Predictive Coding framework with FedSGD. 
We show that the\npre-trained ASR encoder in FL performs as well as a centrally pre-trained model\nand produces an improvement of 12-15% (WER) compared to no pre-training. We\nfurther adapt the federated pre-trained models to a new language, French, and\nshow a 20% (WER) improvement over no pre-training.\n","authors":["Guruprasad V Ramesh","Gopinath Chennupati","Milind Rao","Anit Kumar Sahu","Ariya Rastrow","Jasha Droppo"],"pdf_url":"https://arxiv.org/pdf/2308.02013v2.pdf","comment":"Accepted at ISCA SPSC Symposium 3rd Symposium on Security and Privacy\n in Speech Communication, 2023"},{"id":"http://arxiv.org/abs/2308.03915v1","updated":"2023-08-07T21:20:24Z","published":"2023-08-07T21:20:24Z","title":"Predicting and explaining nonlinear material response using deep\n Physically Guided Neural Networks with Internal Variables","summary":" Nonlinear materials are often difficult to model with classical state model\ntheory because they have a complex and sometimes inaccurate physical and\nmathematical description or we simply do not know how to describe such\nmaterials in terms of relations between external and internal variables. In\nmany disciplines, Neural Network methods have arisen as powerful tools to\nidentify very complex and non-linear correlations. In this work, we use the\nvery recently developed concept of Physically Guided Neural Networks with\nInternal Variables (PGNNIV) to discover constitutive laws using a model-free\napproach and training solely with measured force-displacement data. PGNNIVs\nmake a particular use of the physics of the problem to enforce constraints on\nspecific hidden layers and are able to make predictions without internal\nvariable data. We demonstrate that PGNNIVs are capable of predicting both\ninternal and external variables under unseen load scenarios, regardless of the\nnature of the material considered (linear, with hardening or softening behavior\nand hyperelastic), unravelling the constitutive law of the material hence\nexplaining its nature altogether, placing the method in what is known as\neXplainable Artificial Intelligence (XAI).\n","authors":["Javier Orera-Echeverria","Jacobo Ayensa-Jiménez","Manuel Doblare"],"pdf_url":"https://arxiv.org/pdf/2308.03915v1.pdf","comment":"Main text: 25 pages, 6 figures. Appendices: 13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2112.04629v4","updated":"2023-08-07T21:06:18Z","published":"2021-12-09T00:08:09Z","title":"Transferability Properties of Graph Neural Networks","summary":" Graph neural networks (GNNs) are composed of layers consisting of graph\nconvolutions and pointwise nonlinearities. Due to their invariance and\nstability properties, GNNs are provably successful at learning representations\nfrom data supported on moderate-scale graphs. However, they are difficult to\nlearn on large-scale graphs. In this paper, we study the problem of training\nGNNs on graphs of moderate size and transferring them to large-scale graphs. We\nuse graph limits called graphons to define limit objects for graph filters and\nGNNs -- graphon filters and graphon neural networks (WNNs) -- which we\ninterpret as generative models for graph filters and GNNs. We then show that\ngraphon filters and WNNs can be approximated by graph filters and GNNs sampled\nfrom them on weighted and stochastic graphs. 
Because the error of these\napproximations can be upper bounded, by a triangle inequality argument we can\nfurther bound the error of transferring a graph filter or a GNN across graphs.\nOur results show that (i) the transference error decreases with the graph size,\nand (ii) that graph filters have a transferability-discriminability tradeoff\nthat in GNNs is alleviated by the scattering behavior of the nonlinearity.\nThese findings are demonstrated empirically in a movie recommendation problem\nand in a decentralized control task.\n","authors":["Luana Ruiz","Luiz F. O. Chamon","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2112.04629v4.pdf","comment":"IEEE TSP"},{"id":"http://arxiv.org/abs/2308.03908v1","updated":"2023-08-07T20:50:54Z","published":"2023-08-07T20:50:54Z","title":"ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings\n for Video Action Recognition","summary":" Video Action Recognition (VAR) is a challenging task due to its inherent\ncomplexities. Though different approaches have been explored in the literature,\ndesigning a unified framework to recognize a large number of human actions is\nstill a challenging problem. Recently, Multi-Modal Learning (MML) has\ndemonstrated promising results in this domain. In literature, 2D skeleton or\npose modality has often been used for this task, either independently or in\nconjunction with the visual information (RGB modality) present in videos.\nHowever, the combination of pose, visual information, and text attributes has\nnot been explored yet, though text and pose attributes independently have been\nproven to be effective in numerous computer vision tasks. In this paper, we\npresent the first pose augmented Vision-language model (VLM) for VAR. Notably,\nour scheme achieves an accuracy of 92.81% and 73.02% on two popular human video\naction recognition benchmark datasets, UCF-101 and HMDB-51, respectively, even\nwithout any video data pre-training, and an accuracy of 96.11% and 75.75% after\nkinetics pre-training.\n","authors":["Soumyabrata Chaudhuri","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.03908v1.pdf","comment":"7 pages, 3 figures, 2 Tables"},{"id":"http://arxiv.org/abs/2308.03907v1","updated":"2023-08-07T20:50:48Z","published":"2023-08-07T20:50:48Z","title":"Advancements In Crowd-Monitoring System: A Comprehensive Analysis of\n Systematic Approaches and Automation Algorithms: State-of-The-Art","summary":" Growing apprehensions surrounding public safety have captured the attention\nof numerous governments and security agencies across the globe. These entities\nare increasingly acknowledging the imperative need for reliable and secure\ncrowd-monitoring systems to address these concerns. Effectively managing human\ngatherings necessitates proactive measures to prevent unforeseen events or\ncomplications, ensuring a safe and well-coordinated environment. The scarcity\nof research focusing on crowd monitoring systems and their security\nimplications has given rise to a burgeoning area of investigation, exploring\npotential approaches to safeguard human congregations effectively. Crowd\nmonitoring systems depend on a bifurcated approach, encompassing vision-based\nand non-vision-based technologies. An in-depth analysis of these two\nmethodologies will be conducted in this research. The efficacy of these\napproaches is contingent upon the specific environment and temporal context in\nwhich they are deployed, as they each offer distinct advantages. 
This paper\nendeavors to present an in-depth analysis of the recent incorporation of\nartificial intelligence (AI) algorithms and models into automated systems,\nemphasizing their contemporary applications and effectiveness in various\ncontexts.\n","authors":["Mohammed Ameen","Richard Stone"],"pdf_url":"https://arxiv.org/pdf/2308.03907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03905v1","updated":"2023-08-07T20:43:42Z","published":"2023-08-07T20:43:42Z","title":"Intelligent Assistant Language Understanding On Device","summary":" It has recently become feasible to run personal digital assistants on phones\nand other personal devices. In this paper we describe a design for a natural\nlanguage understanding system that runs on device. In comparison to a\nserver-based assistant, this system is more private, more reliable, faster,\nmore expressive, and more accurate. We describe what led to key choices about\narchitecture and technologies. For example, some approaches in the dialog\nsystems literature are difficult to maintain over time in a deployment setting.\nWe hope that sharing learnings from our practical experiences may help inform\nfuture work in the research community.\n","authors":["Cecilia Aas","Hisham Abdelsalam","Irina Belousova","Shruti Bhargava","Jianpeng Cheng","Robert Daland","Joris Driesen","Federico Flego","Tristan Guigue","Anders Johannsen","Partha Lal","Jiarui Lu","Joel Ruben Antony Moniz","Nathan Perkins","Dhivya Piraviperumal","Stephen Pulman","Diarmuid Ó Séaghdha","David Q. Sun","John Torr","Marco Del Vecchio","Jay Wacker","Jason D. Williams","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03904v1","updated":"2023-08-07T20:41:19Z","published":"2023-08-07T20:41:19Z","title":"On genuine invariance learning without weight-tying","summary":" In this paper, we investigate properties and limitations of invariance\nlearned by neural networks from the data compared to the genuine invariance\nachieved through invariant weight-tying. To do so, we adopt a group theoretical\nperspective and analyze invariance learning in neural networks without\nweight-tying constraints. We demonstrate that even when a network learns to\ncorrectly classify samples on a group orbit, the underlying decision-making in\nsuch a model does not attain genuine invariance. Instead, learned invariance is\nstrongly conditioned on the input data, rendering it unreliable if the input\ndistribution shifts. We next demonstrate how to guide invariance learning\ntoward genuine invariance by regularizing the invariance of a model during\ntraining. To this end, we propose several metrics to quantify learned\ninvariance: (i) predictive distribution invariance, (ii) logit invariance, and\n(iii) saliency invariance similarity. We show that the invariance learned with\nthe invariance error regularization closely resembles the genuine invariance\nof weight-tying models and reliably holds even under a severe input\ndistribution shift. Closer analysis of the learned invariance also reveals the\nspectral decay phenomenon, in which a network chooses to achieve the invariance to\na specific transformation group by reducing the sensitivity to any input\nperturbation.\n","authors":["Artem Moskalev","Anna Sepliarskaia","Erik J. 
Bekkers","Arnold Smeulders"],"pdf_url":"https://arxiv.org/pdf/2308.03904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03901v1","updated":"2023-08-07T20:28:22Z","published":"2023-08-07T20:28:22Z","title":"FLIPS: Federated Learning using Intelligent Participant Selection","summary":" This paper presents the design and implementation of FLIPS, a middleware\nsystem to manage data and participant heterogeneity in federated learning (FL)\ntraining workloads. In particular, we examine the benefits of label\ndistribution clustering on participant selection in federated learning. FLIPS\nclusters parties involved in an FL training job based on the label distribution\nof their data apriori, and during FL training, ensures that each cluster is\nequitably represented in the participants selected. FLIPS can support the most\ncommon FL algorithms, including FedAvg, FedProx, FedDyn, FedOpt and FedYogi. To\nmanage platform heterogeneity and dynamic resource availability, FLIPS\nincorporates a straggler management mechanism to handle changing capacities in\ndistributed, smart community applications. Privacy of label distributions,\nclustering and participant selection is ensured through a trusted execution\nenvironment (TEE). Our comprehensive empirical evaluation compares FLIPS with\nrandom participant selection, as well as two other \"smart\" selection mechanisms\n- Oort and gradient clustering using two real-world datasets, two different\nnon-IID distributions and three common FL algorithms (FedYogi, FedProx and\nFedAvg). We demonstrate that FLIPS significantly improves convergence,\nachieving higher accuracy by 17 - 20 % with 20 - 60 % lower communication\ncosts, and these benefits endure in the presence of straggler participants.\n","authors":["Rahul Atul Bhope","K. R. Jayaram","Nalini Venkatasubramanian","Ashish Verma","Gegi Thomas"],"pdf_url":"https://arxiv.org/pdf/2308.03901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08496v2","updated":"2023-08-07T20:27:19Z","published":"2023-07-17T13:59:07Z","title":"Can We Trust Race Prediction?","summary":" In the absence of sensitive race and ethnicity data, researchers, regulators,\nand firms alike turn to proxies. In this paper, I train a Bidirectional Long\nShort-Term Memory (BiLSTM) model on a novel dataset of voter registration data\nfrom all 50 US states and create an ensemble that achieves up to 36.8% higher\nout of sample (OOS) F1 scores than the best performing machine learning models\nin the literature. Additionally, I construct the most comprehensive database of\nfirst and surname distributions in the US in order to improve the coverage and\naccuracy of Bayesian Improved Surname Geocoding (BISG) and Bayesian Improved\nFirstname Surname Geocoding (BIFSG). Finally, I provide the first high-quality\nbenchmark dataset in order to fairly compare existing models and aid future\nmodel developers.\n","authors":["Cangyuan Li"],"pdf_url":"https://arxiv.org/pdf/2307.08496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13452v3","updated":"2023-08-07T19:57:38Z","published":"2023-05-22T19:52:08Z","title":"Measuring and Modeling Physical Intrinsic Motivation","summary":" Humans are interactive agents driven to seek out situations with interesting\nphysical dynamics. Here we formalize the functional form of physical intrinsic\nmotivation. We first collect ratings of how interesting humans find a variety\nof physics scenarios. 
We then model human interestingness responses by\nimplementing various hypotheses of intrinsic motivation, ranging from models that\nrely on simple scene features to models that depend on forward physics\nprediction. We find that the single best predictor of human responses is\nadversarial reward, a model derived from physical prediction loss. We also find\nthat simple scene feature models do not generalize their prediction of human\nresponses across all scenarios. Finally, linearly combining the adversarial\nmodel with the number of collisions in a scene leads to the greatest\nimprovement in predictivity of human responses, suggesting humans are driven\ntowards scenarios that result in high information gain and physical activity.\n","authors":["Julio Martinez","Felix Binder","Haoliang Wang","Nick Haber","Judith Fan","Daniel L. K. Yamins"],"pdf_url":"https://arxiv.org/pdf/2305.13452v3.pdf","comment":"6 pages, 5 figures, accepted to CogSci 2023 with full paper\n publication in the proceedings"},{"id":"http://arxiv.org/abs/2305.02640v3","updated":"2023-08-07T19:55:10Z","published":"2023-05-04T08:20:37Z","title":"Towards Causal Representation Learning and Deconfounding from Indefinite\n Data","summary":" We redefine causal data from two novel perspectives: the number of causal\nskeletons and the dimension of causal variables, thereby proposing three data\nparadigms. Among them, the indefinite data (like dialogues or video sources) is\ncharacterized by multi-skeleton structures and multi-value variables. Multiple\nskeletons induce low sample utilization, and multi-value variables rule out the usual\ndistribution assumptions, both leading to the fact that learning causal\nrepresentation from indefinite data is, as of yet, largely unexplored. We\ndesign the causal strength variational model to address these two problems.\nSpecifically, we leverage the causal strength instead of independent noise as\nthe latent variable to construct the evidence lower bound. By this design ethos,\nthe causal strengths of different skeletons are regarded as a distribution and\ncan be expressed as a single-valued causal graph matrix. Moreover, considering\nthe latent confounders, we disentangle the causal graph G into two relation\nsubgraphs O and C. O contains pure relations between observed variables, while\nC represents the relations from latent variables to observed variables. We\nimplement the above designs as a dynamic variational inference model, tailored\nto learn causal representation from indefinite data under latent confounding.\nFinally, we conduct comprehensive experiments on synthetic and real-world data\nto demonstrate the effectiveness of our method.\n","authors":["Hang Chen","Xinyu Yang","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2305.02640v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03892v1","updated":"2023-08-07T19:51:10Z","published":"2023-08-07T19:51:10Z","title":"Scalable and Equitable Math Problem Solving Strategy Prediction in Big\n Educational Data","summary":" Understanding a student's problem-solving strategy can have a significant\nimpact on effective math learning using Intelligent Tutoring Systems (ITSs) and\nAdaptive Instructional Systems (AISs). For instance, the ITS/AIS can better\npersonalize itself to correct specific misconceptions that are indicated by\nincorrect strategies, specific problems can be designed to improve strategies,\nand frustration can be minimized by adapting to a student's natural way of\nthinking rather than trying to fit a standard strategy for all. 
While it may be\npossible for human experts to identify strategies manually in classroom\nsettings with sufficient student interaction, it is not possible to scale this\nup to big data. Therefore, we leverage advances in Machine Learning and AI\nmethods to perform scalable strategy prediction that is also fair to students\nat all skill levels. Specifically, we develop an embedding called MVec where we\nlearn a representation based on the mastery of students. We then cluster these\nembeddings with a non-parametric clustering method where we progressively learn\nclusters such that we group together instances that have approximately\nsymmetrical strategies. The strategy prediction model is trained on instances\nsampled from these clusters. This ensures that we train the model over diverse\nstrategies and also that strategies from a particular group do not bias the DNN\nmodel, thus allowing it to optimize its parameters over all groups. Using real\nworld large-scale student interaction datasets from MATHia, we implement our\napproach using transformers and Node2Vec for learning the mastery embeddings\nand LSTMs for predicting strategies. We show that our approach can scale up to\nachieve high accuracy by training on a small sample of a large dataset and also\nhas predictive equality, i.e., it can predict strategies equally well for\nlearners at diverse skill levels.\n","authors":["Anup Shakya","Vasile Rus","Deepak Venugopal"],"pdf_url":"https://arxiv.org/pdf/2308.03892v1.pdf","comment":"12 pages, 7 figures Published as a full paper in the 16th\n International Conference on Educational Data Mining 2023"},{"id":"http://arxiv.org/abs/2301.00790v3","updated":"2023-08-07T19:44:14Z","published":"2022-12-30T17:19:00Z","title":"Online learning techniques for prediction of temporal tabular datasets\n with regime changes","summary":" The application of deep learning to non-stationary temporal datasets can lead\nto overfitted models that underperform under regime changes. In this work, we\npropose a modular machine learning pipeline for ranking predictions on temporal\npanel datasets which is robust under regime changes. The modularity of the\npipeline allows the use of different models, including Gradient Boosting\nDecision Trees (GBDTs) and Neural Networks, with and without feature\nengineering. We evaluate our framework on financial data for stock portfolio\nprediction, and find that GBDT models with dropout display high performance,\nrobustness and generalisability with reduced complexity and computational cost.\nWe then demonstrate how online learning techniques, which require no retraining\nof models, can be used post-prediction to enhance the results. First, we show\nthat dynamic feature projection improves robustness by reducing drawdown in\nregime changes. Second, we demonstrate that dynamical model ensembling based on\nselection of models with good recent performance leads to improved Sharpe and\nCalmar ratios of out-of-sample predictions. 
We also evaluate the robustness of\nour pipeline across different data splits and random seeds with good\nreproducibility.\n","authors":["Thomas Wong","Mauricio Barahona"],"pdf_url":"https://arxiv.org/pdf/2301.00790v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03883v1","updated":"2023-08-07T19:26:09Z","published":"2023-08-07T19:26:09Z","title":"Generative Benchmark Creation for Table Union Search","summary":" Data management has traditionally relied on synthetic data generators to\ngenerate structured benchmarks, like the TPC suite, where we can control\nimportant parameters like data size and its distribution precisely. These\nbenchmarks were central to the success and adoption of database management\nsystems. But more and more, data management problems are of a semantic nature.\nAn important example is finding tables that can be unioned. While any two\ntables with the same cardinality can be unioned, table union search is the\nproblem of finding tables whose union is semantically coherent. Semantic\nproblems cannot be benchmarked using synthetic data. Our current methods for\ncreating benchmarks involve the manual curation and labeling of real data.\nThese methods are not robust or scalable and, perhaps more importantly, it is\nnot clear how robust the created benchmarks are. We propose to use generative\nAI models to create structured data benchmarks for table union search. We\npresent a novel method for using generative models to create tables with\nspecified properties. Using this method, we create a new benchmark containing\npairs of tables that are unionable as well as pairs that are non-unionable but related. We\nthoroughly evaluate recent existing table union search methods over existing\nbenchmarks and our new benchmark. We also present and evaluate a new table\nsearch method based on recent large language models over all benchmarks. We\nshow that the new benchmark is more challenging for all methods than\nhand-curated benchmarks; specifically, the top-performing method achieves a\nMean Average Precision of around 60%, over 30% less than its performance on\nexisting manually created benchmarks. We examine why this is the case and show\nthat the new benchmark permits more detailed analysis of methods, including a\nstudy of both false positives and false negatives that was not possible with\nexisting benchmarks.\n","authors":["Koyena Pal","Aamod Khatiwada","Roee Shraga","Renée J. Miller"],"pdf_url":"https://arxiv.org/pdf/2308.03883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03882v1","updated":"2023-08-07T19:24:47Z","published":"2023-08-07T19:24:47Z","title":"Exploiting Generalization in Offline Reinforcement Learning via Unseen\n State Augmentations","summary":" Offline reinforcement learning (RL) methods strike a balance between\nexploration and exploitation by conservative value estimation -- penalizing\nvalues of unseen states and actions. Model-free methods penalize values at all\nunseen actions, while model-based methods are able to further exploit unseen\nstates via model rollouts. However, such methods are handicapped in their\nability to find unseen states far away from the available offline data due to\ntwo factors -- (a) very short rollout horizons in models due to cascading model\nerrors, and (b) model rollouts originating solely from states observed in\noffline data. We relax the second assumption and present a novel unseen state\naugmentation strategy to allow exploitation of unseen states where the learned\nmodel and value estimates generalize. 
Our strategy finds unseen states by\nvalue-informed perturbations of seen states followed by filtering out states\nwith epistemic uncertainty estimates too high (high error) or too low (too\nsimilar to seen data). We observe improved performance in several offline RL\ntasks and find that our augmentation strategy consistently leads to overall\nlower average dataset Q-value estimates i.e. more conservative Q-value\nestimates than a baseline.\n","authors":["Nirbhay Modhe","Qiaozi Gao","Ashwin Kalyan","Dhruv Batra","Govind Thattai","Gaurav Sukhatme"],"pdf_url":"https://arxiv.org/pdf/2308.03882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03873v1","updated":"2023-08-07T18:50:57Z","published":"2023-08-07T18:50:57Z","title":"Evaluating and Explaining Large Language Models for Code Using Syntactic\n Structures","summary":" Large Language Models (LLMs) for code are a family of high-parameter,\ntransformer-based neural networks pre-trained on massive datasets of both\nnatural and programming languages. These models are rapidly being employed in\ncommercial AI-based developer tools, such as GitHub CoPilot. However, measuring\nand explaining their effectiveness on programming tasks is a challenging\nproposition, given their size and complexity. The methods for evaluating and\nexplaining LLMs for code are inextricably linked. That is, in order to explain\na model's predictions, they must be reliably mapped to fine-grained,\nunderstandable concepts. Once this mapping is achieved, new methods for\ndetailed model evaluations are possible. However, most current explainability\ntechniques and evaluation benchmarks focus on model robustness or individual\ntask performance, as opposed to interpreting model predictions.\n To this end, this paper introduces ASTxplainer, an explainability method\nspecific to LLMs for code that enables both new methods for LLM evaluation and\nvisualizations of LLM predictions that aid end-users in understanding model\npredictions. At its core, ASTxplainer provides an automated method for aligning\ntoken predictions with AST nodes, by extracting and aggregating normalized\nmodel logits within AST structures. To demonstrate the practical benefit of\nASTxplainer, we illustrate the insights that our framework can provide by\nperforming an empirical evaluation on 12 popular LLMs for code using a curated\ndataset of the most popular GitHub projects. Additionally, we perform a user\nstudy examining the usefulness of an ASTxplainer-derived visualization of model\npredictions aimed at enabling model users to explain predictions. The results\nof these studies illustrate the potential for ASTxplainer to provide insights\ninto LLM effectiveness, and aid end-users in understanding predictions.\n","authors":["David N Palacio","Alejandro Velasco","Daniel Rodriguez-Cardenas","Kevin Moran","Denys Poshyvanyk"],"pdf_url":"https://arxiv.org/pdf/2308.03873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03869v1","updated":"2023-08-07T18:40:13Z","published":"2023-08-07T18:40:13Z","title":"Semantic Equivalence of e-Commerce Queries","summary":" Search query variation poses a challenge in e-commerce search, as equivalent\nsearch intents can be expressed through different queries with surface-level\ndifferences. This paper introduces a framework to recognize and leverage query\nequivalence to enhance searcher and business outcomes. 
The proposed approach\naddresses three key problems: mapping queries to vector representations of\nsearch intent, identifying nearest neighbor queries expressing equivalent or\nsimilar intent, and optimizing for user or business objectives. The framework\nutilizes both surface similarity and behavioral similarity to determine query\nequivalence. Surface similarity involves canonicalizing queries based on word\ninflection, word order, compounding, and noise words. Behavioral similarity\nleverages historical search behavior to generate vector representations of\nquery intent. An offline process is used to train a sentence similarity model,\nwhile an online nearest neighbor approach supports processing of unseen\nqueries. Experimental evaluations demonstrate the effectiveness of the proposed\napproach, outperforming popular sentence transformer models and achieving a\nPearson correlation of 0.85 for query similarity. The results highlight the\npotential of leveraging historical behavior data and training models to\nrecognize and utilize query equivalence in e-commerce search, leading to\nimproved user experiences and business outcomes. Further advancements and\nbenchmark datasets are encouraged to facilitate the development of solutions\nfor this critical problem in the e-commerce domain.\n","authors":["Aritra Mandal","Daniel Tunkelang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03869v1.pdf","comment":"The 6th Workshop on e-Commerce and NLP"},{"id":"http://arxiv.org/abs/2308.03854v1","updated":"2023-08-07T18:04:12Z","published":"2023-08-07T18:04:12Z","title":"Revisiting Prompt Engineering via Declarative Crowdsourcing","summary":" Large language models (LLMs) are incredibly powerful at comprehending and\ngenerating data in the form of text, but are brittle and error-prone. There has\nbeen an advent of toolkits and recipes centered around so-called prompt\nengineering-the process of asking an LLM to do something via a series of\nprompts. However, for LLM-powered data processing workflows, in particular,\noptimizing for quality, while keeping cost bounded, is a tedious, manual\nprocess. We put forth a vision for declarative prompt engineering. We view LLMs\nlike crowd workers and leverage ideas from the declarative crowdsourcing\nliterature-including leveraging multiple prompting strategies, ensuring\ninternal consistency, and exploring hybrid-LLM-non-LLM approaches-to make\nprompt engineering a more principled process. Preliminary case studies on\nsorting, entity resolution, and imputation demonstrate the promise of our\napproach\n","authors":["Aditya G. Parameswaran","Shreya Shankar","Parth Asawa","Naman Jain","Yujie Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03842v1","updated":"2023-08-07T18:00:04Z","published":"2023-08-07T18:00:04Z","title":"Search Engine and Recommendation System for the Music Industry built\n with JinaAI","summary":" One of the most intriguing debates regarding a novel task is the development\nof search engines and recommendation-based systems in the music industry.\nStudies have shown a drastic depression in the search engine fields, due to\nconcerning factors such as speed, accuracy and the format of data given for\nquerying. Often people face difficulty in searching for a song solely based on\nthe title, hence a solution is proposed to complete a search analysis through a\nsingle query input and is matched with the lyrics of the songs present in the\ndatabase. 
Hence it is essential to incorporate cutting-edge technology tools\nfor developing a user-friendly search engine. Jina AI is an MLOps framework for\nbuilding neural search engines that are utilized, in order for the user to\nobtain accurate results. Jina AI effectively helps to maintain and enhance the\nquality of performance for the search engine for the query given. An effective\nsearch engine and a recommendation system for the music industry, built with\nJinaAI.\n","authors":["Ishita Gopalakrishnan","Sanjjushri Varshini R","Ponshriharini V"],"pdf_url":"https://arxiv.org/pdf/2308.03842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03825v1","updated":"2023-08-07T16:55:20Z","published":"2023-08-07T16:55:20Z","title":"\"Do Anything Now\": Characterizing and Evaluating In-The-Wild Jailbreak\n Prompts on Large Language Models","summary":" The misuse of large language models (LLMs) has garnered significant attention\nfrom the general public and LLM vendors. In response, efforts have been made to\nalign LLMs with human values and intent use. However, a particular type of\nadversarial prompts, known as jailbreak prompt, has emerged and continuously\nevolved to bypass the safeguards and elicit harmful content from LLMs. In this\npaper, we conduct the first measurement study on jailbreak prompts in the wild,\nwith 6,387 prompts collected from four platforms over six months. Leveraging\nnatural language processing technologies and graph-based community detection\nmethods, we discover unique characteristics of jailbreak prompts and their\nmajor attack strategies, such as prompt injection and privilege escalation. We\nalso observe that jailbreak prompts increasingly shift from public platforms to\nprivate ones, posing new challenges for LLM vendors in proactive detection. To\nassess the potential harm caused by jailbreak prompts, we create a question set\ncomprising 46,800 samples across 13 forbidden scenarios. Our experiments show\nthat current LLMs and safeguards cannot adequately defend jailbreak prompts in\nall scenarios. Particularly, we identify two highly effective jailbreak prompts\nwhich achieve 0.99 attack success rates on ChatGPT (GPT-3.5) and GPT-4, and\nthey have persisted online for over 100 days. Our work sheds light on the\nsevere and evolving threat landscape of jailbreak prompts. We hope our study\ncan facilitate the research community and LLM vendors in promoting safer and\nregulated LLMs.\n","authors":["Xinyue Shen","Zeyuan Chen","Michael Backes","Yun Shen","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03821v1","updated":"2023-08-07T15:30:02Z","published":"2023-08-07T15:30:02Z","title":"Distributionally Robust Classification on a Data Budget","summary":" Real world uses of deep learning require predictable model behavior under\ndistribution shifts. Models such as CLIP show emergent natural distributional\nrobustness comparable to humans, but may require hundreds of millions of\ntraining samples. Can we train robust learners in a domain where data is\nlimited? To rigorously address this question, we introduce JANuS (Joint\nAnnotations and Names Set), a collection of four new training datasets with\nimages, labels, and corresponding captions, and perform a series of carefully\ncontrolled investigations of factors contributing to robustness in image\nclassification, then compare those results to findings derived from a\nlarge-scale meta-analysis. 
Using this approach, we show that standard ResNet-50\ntrained with the cross-entropy loss on 2.4 million image samples can attain\ncomparable robustness to a CLIP ResNet-50 trained on 400 million samples. To\nour knowledge, this is the first result showing (near) state-of-the-art\ndistributional robustness on limited data budgets. Our dataset is available at\n\\url{https://huggingface.co/datasets/penfever/JANuS_dataset}, and the code used\nto reproduce our experiments can be found at\n\\url{https://github.com/penfever/vlhub/}.\n","authors":["Benjamin Feuer","Ameya Joshi","Minh Pham","Chinmay Hegde"],"pdf_url":"https://arxiv.org/pdf/2308.03821v1.pdf","comment":"TMLR 2023; openreview link:\n https://openreview.net/forum?id=D5Z2E8CNsD"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.03703v1","updated":"2023-08-07T16:22:47Z","published":"2023-08-07T16:22:47Z","title":"Video-based Person Re-identification with Long Short-Term Representation\n Learning","summary":" Video-based person Re-Identification (V-ReID) aims to retrieve specific\npersons from raw videos captured by non-overlapped cameras. As a fundamental\ntask, it spreads many multimedia and computer vision applications. However, due\nto the variations of persons and scenes, there are still many obstacles that\nmust be overcome for high performance. In this work, we notice that both the\nlong-term and short-term information of persons are important for robust video\nrepresentations. Thus, we propose a novel deep learning framework named Long\nShort-Term Representation Learning (LSTRL) for effective V-ReID. More\nspecifically, to extract long-term representations, we propose a\nMulti-granularity Appearance Extractor (MAE), in which four granularity\nappearances are effectively captured across multiple frames. Meanwhile, to\nextract short-term representations, we propose a Bi-direction Motion Estimator\n(BME), in which reciprocal motion information is efficiently extracted from\nconsecutive frames. The MAE and BME are plug-and-play and can be easily\ninserted into existing networks for efficient feature learning. As a result,\nthey significantly improve the feature representation ability for V-ReID.\nExtensive experiments on three widely used benchmarks show that our proposed\napproach can deliver better performances than most state-of-the-arts.\n","authors":["Xuehu Liu","Pingping Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03703v1.pdf","comment":"This work is accepted by ICIG2023, including 13 pages, 5 figures and\n 5 tables. Modifications may be performed for further improvements"},{"id":"http://arxiv.org/abs/2308.03643v1","updated":"2023-08-07T14:47:45Z","published":"2023-08-07T14:47:45Z","title":"Mamba: Bringing Multi-Dimensional ABR to WebRTC","summary":" Contemporary real-time video communication systems, such as WebRTC, use an\nadaptive bitrate (ABR) algorithm to assure high-quality and low-delay services,\ne.g., promptly adjusting video bitrate according to the instantaneous network\nbandwidth. However, target bitrate decisions in the network and bitrate control\nin the codec are typically incoordinated and simply ignoring the effect of\ninappropriate resolution and frame rate settings also leads to compromised\nresults in bitrate control, thus devastatingly deteriorating the quality of\nexperience (QoE). 
To tackle these challenges, Mamba, an end-to-end\nmulti-dimensional ABR algorithm is proposed, which utilizes multi-agent\nreinforcement learning (MARL) to maximize the user's QoE by adaptively and\ncollaboratively adjusting encoding factors including the quantization\nparameters (QP), resolution, and frame rate based on observed states such as\nnetwork conditions and video complexity information in a video conferencing\nsystem. We also introduce curriculum learning to improve the training\nefficiency of MARL. Both the in-lab and real-world evaluation results\ndemonstrate the remarkable efficacy of Mamba.\n","authors":["Yueheng Li","Zicheng Zhang","Hao Chen","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2308.03643v1.pdf","comment":"In Proceedings of the 31st ACM International Conference on\n Multimedia, October 29-November 3, 2023, Ottawa, ON, Canada. ACM, New York,\n NY, USA, 9 pages"},{"id":"http://arxiv.org/abs/2308.03475v1","updated":"2023-08-07T11:05:59Z","published":"2023-08-07T11:05:59Z","title":"COPA: Efficient Vision-Language Pre-training Through Collaborative\n Object- and Patch-Text Alignment","summary":" Vision-Language Pre-training (VLP) methods based on object detection enjoy\nthe rich knowledge of fine-grained object-text alignment but at the cost of\ncomputationally expensive inference. Recent Visual-Transformer (ViT)-based\napproaches circumvent this issue while struggling with long visual sequences\nwithout detailed cross-modal alignment information. This paper introduces a\nViT-based VLP technique that efficiently incorporates object information\nthrough a novel patch-text alignment mechanism. Specifically, we convert\nobject-level signals into patch-level ones and devise a Patch-Text Alignment\npre-training task (PTA) to learn a text-aware patch detector. By using\noff-the-shelf delicate object annotations in 5\\% training images, we jointly\ntrain PTA with other conventional VLP objectives in an end-to-end manner,\nbypassing the high computational cost of object detection and yielding an\neffective patch detector that accurately detects text-relevant patches, thus\nconsiderably reducing patch sequences and accelerating computation within the\nViT backbone. Our experiments on a variety of widely-used benchmarks reveal\nthat our method achieves a speedup of nearly 88\\% compared to prior VLP models\nwhile maintaining competitive or superior performance on downstream tasks with\nsimilar model size and data scale.\n","authors":["Chaoya Jiang","Haiyang Xu","Wei Ye","Qinghao Ye","Chenliang Li","Ming Yan","Bin Bi","Shikun Zhang","Ji Zhang","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03475v1.pdf","comment":"Accepted on ACM MM2023"},{"id":"http://arxiv.org/abs/2308.03463v1","updated":"2023-08-07T10:41:52Z","published":"2023-08-07T10:41:52Z","title":"DiffSynth: Latent In-Iteration Deflickering for Realistic Video\n Synthesis","summary":" In recent years, diffusion models have emerged as the most powerful approach\nin image synthesis. However, applying these models directly to video synthesis\npresents challenges, as it often leads to noticeable flickering contents.\nAlthough recently proposed zero-shot methods can alleviate flicker to some\nextent, we still struggle to generate coherent videos. In this paper, we\npropose DiffSynth, a novel approach that aims to convert image synthesis\npipelines to video synthesis pipelines. DiffSynth consists of two key\ncomponents: a latent in-iteration deflickering framework and a video\ndeflickering algorithm. 
The latent in-iteration deflickering framework applies\nvideo deflickering to the latent space of diffusion models, effectively\npreventing flicker accumulation in intermediate steps. Additionally, we propose\na video deflickering algorithm, named patch blending algorithm, that remaps\nobjects in different frames and blends them together to enhance video\nconsistency. One of the notable advantages of DiffSynth is its general\napplicability to various video synthesis tasks, including text-guided video\nstylization, fashion video synthesis, image-guided video stylization, video\nrestoring, and 3D rendering. In the task of text-guided video stylization, we\nmake it possible to synthesize high-quality videos without cherry-picking. The\nexperimental results demonstrate the effectiveness of DiffSynth. All videos can\nbe viewed on our project page. Source codes will also be released.\n","authors":["Zhongjie Duan","Lizhou You","Chengyu Wang","Cen Chen","Ziheng Wu","Weining Qian","Jun Huang","Fei Chao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.03463v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.07176v2","updated":"2023-08-07T10:09:21Z","published":"2023-05-11T23:12:13Z","title":"Automatic Radiology Report Generation by Learning with Increasingly Hard\n Negatives","summary":" Automatic radiology report generation is challenging as medical images or\nreports are usually similar to each other due to the common content of anatomy.\nThis makes a model hard to capture the uniqueness of individual images and is\nprone to producing undesired generic or mismatched reports. This situation\ncalls for learning more discriminative features that could capture even\nfine-grained mismatches between images and reports. To achieve this, this paper\nproposes a novel framework to learn discriminative image and report features by\ndistinguishing them from their closest peers, i.e., hard negatives. Especially,\nto attain more discriminative features, we gradually raise the difficulty of\nsuch a learning task by creating increasingly hard negative reports for each\nimage in the feature space during training, respectively. By treating the\nincreasingly hard negatives as auxiliary variables, we formulate this process\nas a min-max alternating optimisation problem. At each iteration, conditioned\non a given set of hard negative reports, image and report features are learned\nas usual by minimising the loss functions related to report generation. After\nthat, a new set of harder negative reports will be created by maximising a loss\nreflecting image-report alignment. By solving this optimisation, we attain a\nmodel that can generate more specific and accurate reports. It is noteworthy\nthat our framework enhances discriminative feature learning without introducing\nextra network weights. Also, in contrast to the existing way of generating hard\nnegatives, our framework extends beyond the granularity of the dataset by\ngenerating harder samples out of the training set. 
Experimental study on\nbenchmark datasets verifies the efficacy of our framework and shows that it can\nserve as a plug-in to readily improve existing medical report generation\nmodels.\n","authors":["Bhanu Prakash Voutharoja","Lei Wang","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.07176v2.pdf","comment":"Accepted to European Conference on Artificial Intelligence (ECAI)\n 2023"},{"id":"http://arxiv.org/abs/2308.03432v1","updated":"2023-08-07T09:26:36Z","published":"2023-08-07T09:26:36Z","title":"Cuing Without Sharing: A Federated Cued Speech Recognition Framework via\n Mutual Knowledge Distillation","summary":" Cued Speech (CS) is a visual coding tool to encode spoken languages at the\nphonetic level, which combines lip-reading and hand gestures to effectively\nassist communication among people with hearing impairments. The Automatic CS\nRecognition (ACSR) task aims to recognize CS videos into linguistic texts,\nwhich involves both lips and hands as two distinct modalities conveying\ncomplementary information. However, the traditional centralized training\napproach poses potential privacy risks due to the use of facial and gesture\nvideos in CS data. To address this issue, we propose a new Federated Cued\nSpeech Recognition (FedCSR) framework to train an ACSR model over the\ndecentralized CS data without sharing private information. In particular, a\nmutual knowledge distillation method is proposed to maintain cross-modal\nsemantic consistency of the Non-IID CS data, which ensures learning a unified\nfeature space for both linguistic and visual information. On the server side, a\nglobally shared linguistic model is trained to capture the long-term\ndependencies in the text sentences, which is aligned with the visual\ninformation from the local clients via visual-to-linguistic distillation. On\nthe client side, the visual model of each client is trained with its own local\ndata, assisted by linguistic-to-visual distillation treating the linguistic\nmodel as the teacher. To the best of our knowledge, this is the first approach\nto consider the federated ACSR task for privacy protection. Experimental\nresults on the Chinese CS dataset with multiple cuers demonstrate that our\napproach outperforms both mainstream federated learning baselines and existing\ncentralized state-of-the-art ACSR methods, achieving 9.7% performance\nimprovement for character error rate (CER) and 15.0% for word error rate (WER).\n","authors":["Yuxuan Zhang","Lei Liu","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03950v1","updated":"2023-08-07T23:41:55Z","published":"2023-08-07T23:41:55Z","title":"Zero-shot Skeleton-based Action Recognition via Mutual Information\n Estimation and Maximization","summary":" Zero-shot skeleton-based action recognition aims to recognize actions of\nunseen categories after training on data of seen categories. The key is to\nbuild the connection between visual and semantic space from seen to unseen\nclasses. Previous studies have primarily focused on encoding sequences into a\nsingular feature vector, with subsequent mapping the features to an identical\nanchor point within the embedded space. Their performance is hindered by 1) the\nignorance of the global visual/semantic distribution alignment, which results\nin a limitation to capture the true interdependence between the two spaces. 
2)\nthe negligence of temporal information since the frame-wise features with rich\naction clues are directly pooled into a single feature vector. We propose a new\nzero-shot skeleton-based action recognition method via mutual information (MI)\nestimation and maximization. Specifically, 1) we maximize the MI between visual\nand semantic space for distribution alignment; 2) we leverage the temporal\ninformation for estimating the MI by encouraging MI to increase as more frames\nare observed. Extensive experiments on three large-scale skeleton action\ndatasets confirm the effectiveness of our method. Code:\nhttps://github.com/YujieOuO/SMIE.\n","authors":["Yujie Zhou","Wenwen Qiang","Anyi Rao","Ning Lin","Bing Su","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03950v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03826v1","updated":"2023-08-07T17:49:04Z","published":"2023-08-07T17:49:04Z","title":"Recurrent Multi-scale Transformer for High-Resolution Salient Object\n Detection","summary":" Salient Object Detection (SOD) aims to identify and segment the most\nconspicuous objects in an image or video. As an important pre-processing step,\nit has many potential applications in multimedia and vision tasks. With the\nadvance of imaging devices, SOD with high-resolution images is of great demand,\nrecently. However, traditional SOD methods are largely limited to\nlow-resolution images, making them difficult to adapt to the development of\nHigh-Resolution SOD (HRSOD). Although some HRSOD methods emerge, there are no\nlarge enough datasets for training and evaluating. Besides, current HRSOD\nmethods generally produce incomplete object regions and irregular object\nboundaries. To address above issues, in this work, we first propose a new\nHRS10K dataset, which contains 10,500 high-quality annotated images at 2K-8K\nresolution. As far as we know, it is the largest dataset for the HRSOD task,\nwhich will significantly help future works in training and evaluating models.\nFurthermore, to improve the HRSOD performance, we propose a novel Recurrent\nMulti-scale Transformer (RMFormer), which recurrently utilizes shared\nTransformers and multi-scale refinement architectures. Thus, high-resolution\nsaliency maps can be generated with the guidance of lower-resolution\npredictions. Extensive experiments on both high-resolution and low-resolution\nbenchmarks show the effectiveness and superiority of the proposed framework.\nThe source code and dataset are released at:\nhttps://github.com/DrowsyMon/RMFormer.\n","authors":["Xinhao Deng","Pingping Zhang","Wei Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03826v1.pdf","comment":"This work is accepted by ACM MM2023. More modifications may be\n performed for further improvements"}]},"2023-08-08T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.04430v1","updated":"2023-08-08T17:58:15Z","published":"2023-08-08T17:58:15Z","title":"SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore","summary":" The legality of training language models (LMs) on copyrighted or otherwise\nrestricted data is under intense debate. However, as we show, model performance\nsignificantly degrades if trained only on low-risk text (e.g., out-of-copyright\nbooks or government documents), due to its limited size and domain coverage. We\npresent SILO, a new language model that manages this risk-performance tradeoff\nduring inference. 
SILO is built by (1) training a parametric LM on Open License\nCorpus (OLC), a new corpus we curate with 228B tokens of public domain and\npermissively licensed text and (2) augmenting it with a more general and easily\nmodifiable nonparametric datastore (e.g., containing copyrighted books or news)\nthat is only queried during inference. The datastore allows use of high-risk\ndata without training on it, supports sentence-level data attribution, and\nenables data producers to opt out from the model by removing content from the\nstore. These capabilities can foster compliance with data-use regulations such\nas the fair use doctrine in the United States and the GDPR in the European\nUnion. Our experiments show that the parametric LM struggles on domains not\ncovered by OLC. However, access to the datastore greatly improves out of domain\nperformance, closing 90% of the performance gap with an LM trained on the Pile,\na more diverse corpus with mostly high-risk text. We also analyze which\nnonparametric approach works best, where the remaining errors lie, and how\nperformance scales with datastore size. Our results suggest that it is possible\nto build high quality language models while mitigating their legal risk.\n","authors":["Sewon Min","Suchin Gururangan","Eric Wallace","Hannaneh Hajishirzi","Noah A. Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2308.04430v1.pdf","comment":"27 pages; 6 figures. Code, models, and data available at\n https://github.com/kernelmachine/silo-lm"},{"id":"http://arxiv.org/abs/2308.04424v1","updated":"2023-08-08T17:53:24Z","published":"2023-08-08T17:53:24Z","title":"A Bi-directional Multi-hop Inference Model for Joint Dialog Sentiment\n Classification and Act Recognition","summary":" The joint task of Dialog Sentiment Classification (DSC) and Act Recognition\n(DAR) aims to predict the sentiment label and act label for each utterance in a\ndialog simultaneously. However, current methods encode the dialog context in\nonly one direction, which limits their ability to thoroughly comprehend the\ncontext. Moreover, these methods overlook the explicit correlations between\nsentiment and act labels, which leads to an insufficient ability to capture\nrich sentiment and act clues and hinders effective and accurate reasoning. To\naddress these issues, we propose a Bi-directional Multi-hop Inference Model\n(BMIM) that leverages a feature selection network and a bi-directional\nmulti-hop inference network to iteratively extract and integrate rich sentiment\nand act clues in a bi-directional manner. We also employ contrastive learning\nand dual learning to explicitly model the correlations of sentiment and act\nlabels. Our experiments on two widely-used datasets show that BMIM outperforms\nstate-of-the-art baselines by at least 2.6% on F1 score in DAR and 1.4% on F1\nscore in DSC. Additionally, Our proposed model not only improves the\nperformance but also enhances the interpretability of the joint sentiment and\nact prediction task.\n","authors":["Li Zheng","Fei Li","Yuyang Chai","Chong Teng","Donghong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.04424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15002v5","updated":"2023-08-08T17:39:57Z","published":"2023-07-27T16:57:32Z","title":"Gzip versus bag-of-words for text classification","summary":" The effectiveness of compression in text classification ('gzip') has recently\ngarnered lots of attention. 
In this note we show that `bag-of-words' approaches\ncan achieve similar or better results, and are more efficient.\n","authors":["Juri Opitz"],"pdf_url":"https://arxiv.org/pdf/2307.15002v5.pdf","comment":"improved writing, extended with more results"},{"id":"http://arxiv.org/abs/2308.04398v1","updated":"2023-08-08T17:01:42Z","published":"2023-08-08T17:01:42Z","title":"Character-level NMT and language similarity","summary":" We explore the effectiveness of character-level neural machine translation\nusing Transformer architecture for various levels of language similarity and\nsize of the training dataset on translation between Czech and Croatian, German,\nHungarian, Slovak, and Spanish. We evaluate the models using automatic MT\nmetrics and show that translation between similar languages benefits from\ncharacter-level input segmentation, while for less related languages,\ncharacter-level vanilla Transformer-base often lags behind subword-level\nsegmentation. We confirm previous findings that it is possible to close the gap\nby finetuning the already trained subword-level models to character-level.\n","authors":["Josef Jon","Ondřej Bojar"],"pdf_url":"https://arxiv.org/pdf/2308.04398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04386v1","updated":"2023-08-08T16:41:16Z","published":"2023-08-08T16:41:16Z","title":"Learning Evaluation Models from Large Language Models for Sequence\n Generation","summary":" Large language models achieve state-of-the-art performance on sequence\ngeneration evaluation, but typically have a large number of parameters. This is\na computational challenge as presented by applying their evaluation capability\nat scale. To overcome the challenge, in this paper, we propose \\textbf{ECT}, an\n\\textbf{e}valuation \\textbf{c}apability \\textbf{t}ransfer method, to transfer\nthe evaluation capability from LLMs to relatively lightweight language models.\nBased on the proposed ECT, we learn various evaluation models from ChatGPT, and\nemploy them as reward models to improve sequence generation models via\nreinforcement learning and reranking approaches. Experimental results on\nmachine translation, text style transfer, and summarization tasks demonstrate\nthe effectiveness of our ECT. Notably, applying the learned evaluation models\nto sequence generation models results in better generated sequences as\nevaluated by commonly used metrics and ChatGPT.\n","authors":["Chenglong Wang","Hang Zhou","Kaiyan Chang","Tongran Liu","Chunliang Zhang","Quan Du","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.04386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06713v2","updated":"2023-08-08T16:21:49Z","published":"2023-07-13T12:11:36Z","title":"Unsupervised Calibration through Prior Adaptation for Text\n Classification using Large Language Models","summary":" A wide variety of natural language tasks are currently being addressed with\nlarge-scale language models (LLMs). These models are usually trained with a\nvery large amount of unsupervised text data and adapted to perform a downstream\nnatural language task using methods like fine-tuning, calibration or in-context\nlearning. In this work, we propose an approach to adapt the prior class\ndistribution to perform text classification tasks without the need for labelled\nsamples and only few in-domain sample queries. The proposed approach treats the\nLLM as a black box, adding a stage where the model posteriors are calibrated to\nthe task. 
Results show that these methods outperform the un-adapted model for\ndifferent number of training shots in the prompt and a previous approach were\ncalibration is performed without using any adaptation data.\n","authors":["Lautaro Estienne"],"pdf_url":"https://arxiv.org/pdf/2307.06713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04346v1","updated":"2023-08-08T15:46:27Z","published":"2023-08-08T15:46:27Z","title":"Unmasking Nationality Bias: A Study of Human Perception of Nationalities\n in AI-Generated Articles","summary":" We investigate the potential for nationality biases in natural language\nprocessing (NLP) models using human evaluation methods. Biased NLP models can\nperpetuate stereotypes and lead to algorithmic discrimination, posing a\nsignificant challenge to the fairness and justice of AI systems. Our study\nemploys a two-step mixed-methods approach that includes both quantitative and\nqualitative analysis to identify and understand the impact of nationality bias\nin a text generation model. Through our human-centered quantitative analysis,\nwe measure the extent of nationality bias in articles generated by AI sources.\nWe then conduct open-ended interviews with participants, performing qualitative\ncoding and thematic analysis to understand the implications of these biases on\nhuman readers. Our findings reveal that biased NLP models tend to replicate and\namplify existing societal biases, which can translate to harm if used in a\nsociotechnical setting. The qualitative analysis from our interviews offers\ninsights into the experience readers have when encountering such articles,\nhighlighting the potential to shift a reader's perception of a country. These\nfindings emphasize the critical role of public perception in shaping AI's\nimpact on society and the need to correct biases in AI systems.\n","authors":["Pranav Narayanan Venkit","Sanjana Gautam","Ruchi Panchanadikar","Ting-Hao `Kenneth' Huang","Shomir Wilson"],"pdf_url":"https://arxiv.org/pdf/2308.04346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03629v2","updated":"2023-08-08T15:38:21Z","published":"2023-08-07T14:36:03Z","title":"MedMine: Examining Pre-trained Language Models on Medication Mining","summary":" Automatic medication mining from clinical and biomedical text has become a\npopular topic due to its real impact on healthcare applications and the recent\ndevelopment of powerful language models (LMs). However, fully-automatic\nextraction models still face obstacles to be overcome such that they can be\ndeployed directly into clinical practice for better impacts. Such obstacles\ninclude their imbalanced performances on different entity types and clinical\nevents. In this work, we examine current state-of-the-art pre-trained language\nmodels (PLMs) on such tasks, via fine-tuning including the monolingual model\nMed7 and multilingual large language model (LLM) XLM-RoBERTa. We compare their\nadvantages and drawbacks using historical medication mining shared task data\nsets from n2c2-2018 challenges. 
We report the findings we get from these\nfine-tuning experiments such that they can facilitate future research on\naddressing them, for instance, how to combine their outputs, merge such models,\nor improve their overall accuracy by ensemble learning and data augmentation.\nMedMine is part of the M3 Initiative \\url{https://github.com/HECTA-UoM/M3}\n","authors":["Haifa Alrdahi","Lifeng Han","Hendrik Šuvalov","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.03629v2.pdf","comment":"Open Research Project. 7 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2308.04333v1","updated":"2023-08-08T15:26:58Z","published":"2023-08-08T15:26:58Z","title":"Towards an AI to Win Ghana's National Science and Maths Quiz","summary":" Can an AI win Ghana's National Science and Maths Quiz (NSMQ)? That is the\nquestion we seek to answer in the NSMQ AI project, an open-source project that\nis building AI to compete live in the NSMQ and win. The NSMQ is an annual live\nscience and mathematics competition for senior secondary school students in\nGhana in which 3 teams of 2 students compete by answering questions across\nbiology, chemistry, physics, and math in 5 rounds over 5 progressive stages\nuntil a winning team is crowned for that year. The NSMQ is an exciting live\nquiz competition with interesting technical challenges across speech-to-text,\ntext-to-speech, question-answering, and human-computer interaction. In this\nongoing work that began in January 2023, we give an overview of the project,\ndescribe each of the teams, progress made thus far, and the next steps toward\nour planned launch and debut of the AI in October for NSMQ 2023. An AI that\nconquers this grand challenge can have real-world impact on education such as\nenabling millions of students across Africa to have one-on-one learning support\nfrom this AI.\n","authors":["George Boateng","Jonathan Abrefah Mensah","Kevin Takyi Yeboah","William Edor","Andrew Kojo Mensah-Onumah","Naafi Dasana Ibrahim","Nana Sam Yeboah"],"pdf_url":"https://arxiv.org/pdf/2308.04333v1.pdf","comment":"7 pages. Under review at Deep Learning Indaba and Black in AI\n Workshop @NeurIPS 2023"},{"id":"http://arxiv.org/abs/2308.04306v1","updated":"2023-08-08T14:51:16Z","published":"2023-08-08T14:51:16Z","title":"Deep Learning-Based Knowledge Injection for Metaphor Detection: A\n Comprehensive Review","summary":" The history of metaphor research also marks the evolution of knowledge\ninfusion research. With the continued advancement of deep learning techniques\nin recent years, the natural language processing community has shown great\ninterest in applying knowledge to successful results in metaphor recognition\ntasks. Although there has been a gradual increase in the number of approaches\ninvolving knowledge injection in the field of metaphor recognition, there is a\nlack of a complete review article on knowledge injection based approaches.\nTherefore, the goal of this paper is to provide a comprehensive review of\nresearch advances in the application of deep learning for knowledge injection\nin metaphor recognition tasks. In this paper, we systematically summarize and\ngeneralize the mainstream knowledge and knowledge injection principles, as well\nas review the datasets, evaluation metrics, and benchmark models used in\nmetaphor recognition tasks. 
Finally, we explore the current issues facing\nknowledge injection methods and provide an outlook on future research\ndirections.\n","authors":["Cheng Yang","Wenye Zhao","Qingbao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.04306v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2308.04286v1","updated":"2023-08-08T14:29:35Z","published":"2023-08-08T14:29:35Z","title":"Comparative Analysis of the wav2vec 2.0 Feature Extractor","summary":" Automatic speech recognition (ASR) systems typically use handcrafted feature\nextraction pipelines. To avoid their inherent information loss and to achieve\nmore consistent modeling from speech to transcribed text, neural raw waveform\nfeature extractors (FEs) are an appealing approach. Also the wav2vec 2.0 model,\nwhich has recently gained large popularity, uses a convolutional FE which\noperates directly on the speech waveform. However, it is not yet studied\nextensively in the literature. In this work, we study its capability to replace\nthe standard feature extraction methods in a connectionist temporal\nclassification (CTC) ASR model and compare it to an alternative neural FE. We\nshow that both are competitive with traditional FEs on the LibriSpeech\nbenchmark and analyze the effect of the individual components. Furthermore, we\nanalyze the learned filters and show that the most important information for\nthe ASR system is obtained by a set of bandpass filters.\n","authors":["Peter Vieting","Ralf Schlüter","Hermann Ney"],"pdf_url":"https://arxiv.org/pdf/2308.04286v1.pdf","comment":"Accepted at ITG 2023"},{"id":"http://arxiv.org/abs/2308.04275v1","updated":"2023-08-08T14:17:17Z","published":"2023-08-08T14:17:17Z","title":"In-Context Alignment: Chat with Vanilla Language Models Before\n Fine-Tuning","summary":" In this note, we explore inference-time alignment through in-context\nlearning. We consider a vanilla pretrained language model Llama-2 before any\nfine-tuning and retrieve an average of 9 demonstration alignment examples when\nthe model is prompted to follow chat-style instructions. Compared to direct\nprompting, the in-context alignment without changing model weights leads to a\n7x increase in win-rate w.r.t. the text-davinci-003 model from OpenAI, making\nthe vanilla language model comparable to strong baselines with alignment\nfine-tuning.\n","authors":["Xiaochuang Han"],"pdf_url":"https://arxiv.org/pdf/2308.04275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11661v2","updated":"2023-08-08T13:44:12Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. 
We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. The code, prompts, and auxiliary text dataset is\navailable at https://github.com/mayug/VDT-Adapter.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v2.pdf","comment":"Paper accepted at ICCV-W 2023. V2 contains additional comparisons\n with concurrent works"},{"id":"http://arxiv.org/abs/2308.04255v1","updated":"2023-08-08T13:41:41Z","published":"2023-08-08T13:41:41Z","title":"CLASSLA-Stanza: The Next Step for Linguistic Processing of South Slavic\n Languages","summary":" We present CLASSLA-Stanza, a pipeline for automatic linguistic annotation of\nthe South Slavic languages, which is based on the Stanza natural language\nprocessing pipeline. We describe the main improvements in CLASSLA-Stanza with\nrespect to Stanza, and give a detailed description of the model training\nprocess for the latest 2.1 release of the pipeline. We also report performance\nscores produced by the pipeline for different languages and varieties.\nCLASSLA-Stanza exhibits consistently high performance across all the supported\nlanguages and outperforms or expands its parent pipeline Stanza at all the\nsupported tasks. We also present the pipeline's new functionality enabling\nefficient processing of web data and the reasons that led to its\nimplementation.\n","authors":["Luka Terčon","Nikola Ljubešić"],"pdf_url":"https://arxiv.org/pdf/2308.04255v1.pdf","comment":"17 pages, 14 tables, 1 figure"},{"id":"http://arxiv.org/abs/2302.03512v3","updated":"2023-08-08T13:27:29Z","published":"2023-02-07T14:56:52Z","title":"A Survey on Arabic Named Entity Recognition: Past, Recent Advances, and\n Future Trends","summary":" As more and more Arabic texts emerged on the Internet, extracting important\ninformation from these Arabic texts is especially useful. As a fundamental\ntechnology, Named entity recognition (NER) serves as the core component in\ninformation extraction technology, while also playing a critical role in many\nother Natural Language Processing (NLP) systems, such as question answering and\nknowledge graph building. In this paper, we provide a comprehensive review of\nthe development of Arabic NER, especially the recent advances in deep learning\nand pre-trained language model. Specifically, we first introduce the background\nof Arabic NER, including the characteristics of Arabic and existing resources\nfor Arabic NER. Then, we systematically review the development of Arabic NER\nmethods. Traditional Arabic NER systems focus on feature engineering and\ndesigning domain-specific rules. In recent years, deep learning methods achieve\nsignificant progress by representing texts via continuous vector\nrepresentations. With the growth of pre-trained language model, Arabic NER\nyields better performance. 
Finally, we conclude the method gap between Arabic\nNER and NER methods from other languages, which helps outline future directions\nfor Arabic NER.\n","authors":["Xiaoye Qu","Yingjie Gu","Qingrong Xia","Zechang Li","Zhefeng Wang","Baoxing Huai"],"pdf_url":"https://arxiv.org/pdf/2302.03512v3.pdf","comment":"Accepted by IEEE TKDE"},{"id":"http://arxiv.org/abs/2308.04248v1","updated":"2023-08-08T13:26:53Z","published":"2023-08-08T13:26:53Z","title":"Gloss Alignment Using Word Embeddings","summary":" Capturing and annotating Sign language datasets is a time consuming and\ncostly process. Current datasets are orders of magnitude too small to\nsuccessfully train unconstrained \\acf{slt} models. As a result, research has\nturned to TV broadcast content as a source of large-scale training data,\nconsisting of both the sign language interpreter and the associated audio\nsubtitle. However, lack of sign language annotation limits the usability of\nthis data and has led to the development of automatic annotation techniques\nsuch as sign spotting. These spottings are aligned to the video rather than the\nsubtitle, which often results in a misalignment between the subtitle and\nspotted signs. In this paper we propose a method for aligning spottings with\ntheir corresponding subtitles using large spoken language models. Using a\nsingle modality means our method is computationally inexpensive and can be\nutilized in conjunction with existing alignment techniques. We quantitatively\ndemonstrate the effectiveness of our method on the \\acf{mdgs} and \\acf{bobsl}\ndatasets, recovering up to a 33.22 BLEU-1 score in word alignment.\n","authors":["Harry Walsh","Ozge Mercanoglu Sincan","Ben Saunders","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2308.04248v1.pdf","comment":"4 pages, 4 figures, 2023 IEEE International Conference on Acoustics,\n Speech, and Signal Processing Workshops (ICASSPW)"},{"id":"http://arxiv.org/abs/2306.09841v3","updated":"2023-08-08T12:57:18Z","published":"2023-06-16T13:39:35Z","title":"Are Large Language Models Really Good Logical Reasoners? A Comprehensive\n Evaluation and Beyond","summary":" Logical reasoning consistently plays a fundamental and significant role in\nthe domains of knowledge engineering and artificial intelligence. Recently,\nLarge Language Models (LLMs) have emerged as a noteworthy innovation in natural\nlanguage processing (NLP), exhibiting impressive achievements across various\nclassic NLP tasks. However, the question of whether LLMs can effectively\naddress the task of logical reasoning, which requires gradual cognitive\ninference similar to human intelligence, remains unanswered. To this end, we\naim to bridge this gap and provide comprehensive evaluations in this paper.\nFirstly, to offer systematic evaluations, we select fifteen typical logical\nreasoning datasets and organize them into deductive, inductive, abductive and\nmixed-form reasoning settings. Considering the comprehensiveness of\nevaluations, we include three representative LLMs (i.e., text-davinci-003,\nChatGPT and BARD) and evaluate them on all selected datasets under zero-shot,\none-shot and three-shot settings. Secondly, different from previous evaluations\nrelying only on simple metrics (e.g., accuracy), we propose fine-level\nevaluations from objective and subjective manners, covering both answers and\nexplanations. 
Additionally, to uncover the logical flaws of LLMs, problematic\ncases will be attributed to five error types from two dimensions, i.e.,\nevidence selection process and reasoning process. Thirdly, to avoid the\ninfluences of knowledge bias and purely focus on benchmarking the logical\nreasoning capability of LLMs, we propose a new dataset with neutral content. It\ncontains 3,000 samples and covers deductive, inductive and abductive settings.\nBased on the in-depth evaluations, this paper finally forms a general\nevaluation scheme of logical reasoning capability from six dimensions. It\nreflects the pros and cons of LLMs and gives guiding directions for future\nworks.\n","authors":["Fangzhi Xu","Qika Lin","Jiawei Han","Tianzhe Zhao","Jun Liu","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2306.09841v3.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.04215v1","updated":"2023-08-08T12:27:20Z","published":"2023-08-08T12:27:20Z","title":"Hybrid Retrieval-Augmented Generation for Real-time Composition\n Assistance","summary":" Retrieval augmented models show promise in enhancing traditional language\nmodels by improving their contextual understanding, integrating private data,\nand reducing hallucination. However, the processing time required for retrieval\naugmented large language models poses a challenge when applying them to tasks\nthat require real-time responses, such as composition assistance.\n To overcome this limitation, we propose the Hybrid Retrieval-Augmented\nGeneration (HybridRAG) framework that leverages a hybrid setting that combines\nboth client and cloud models. HybridRAG incorporates retrieval-augmented memory\ngenerated asynchronously by a Large Language Model (LLM) in the cloud. By\nintegrating this retrieval augmented memory, the client model acquires the\ncapability to generate highly effective responses, benefiting from the LLM's\ncapabilities. Furthermore, through asynchronous memory integration, the client\nmodel is capable of delivering real-time responses to user requests without the\nneed to wait for memory synchronization from the cloud. Our experiments on\nWikitext and Pile subsets show that HybridRAG achieves lower latency than a\ncloud-based retrieval-augmented LLM, while outperforming client-only models in\nutility.\n","authors":["Xuchao Zhang","Menglin Xia","Camille Couturier","Guoqing Zheng","Saravan Rajmohan","Victor Ruhle"],"pdf_url":"https://arxiv.org/pdf/2308.04215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09998v3","updated":"2023-08-08T12:23:49Z","published":"2023-07-19T14:13:02Z","title":"Generating Mathematical Derivations with Large Language Models","summary":" The derivation of mathematical results in specialised fields, using Large\nLanguage Models (LLMs), is an emerging research direction that can help\nidentify models' limitations, and potentially support mathematical discovery.\nIn this paper, we leverage a symbolic engine to generate derivations of\nequations at scale, and investigate the capabilities of LLMs when deriving goal\nequations from premises. Specifically, we employ in-context learning for GPT\nand fine-tune a range of T5 models to compare the robustness and generalisation\nof pre-training strategies to specialised models. Empirical results show that\nfine-tuned FLAN-T5-large (MathT5) outperforms GPT models on all static and\nout-of-distribution test sets in conventional scores. 
However, an in-depth\nanalysis reveals that the fine-tuned models are more sensitive to perturbations\ninvolving unseen symbols and (to a lesser extent) changes to equation\nstructure. In addition, we analyse 1.7K equations, and over 200 derivations, to\nhighlight common reasoning errors such as the inclusion of incorrect,\nirrelevant, and redundant equations. Finally, we explore the suitability of\nexisting metrics for evaluating mathematical derivations and find evidence\nthat, while they can capture general properties such as sensitivity to\nperturbations, they fail to highlight fine-grained reasoning errors and\nessential differences between models. Overall, this work demonstrates that\ntraining models on synthetic data may improve their math capabilities beyond\nmuch larger LLMs, but current metrics are not appropriately assessing the\nquality of generated mathematical text.\n","authors":["Jordan Meadows","Marco Valentino","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2307.09998v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.03565v2","updated":"2023-08-08T12:12:55Z","published":"2023-08-07T13:16:42Z","title":"Topological Interpretations of GPT-3","summary":" This is an experiential study of investigating a consistent method for\nderiving the correlation between sentence vector and semantic meaning of a\nsentence. We first used three state-of-the-art word/sentence embedding methods\nincluding GPT-3, Word2Vec, and Sentence-BERT, to embed plain text sentence\nstrings into high dimensional spaces. Then we compute the pairwise distance\nbetween any possible combination of two sentence vectors in an embedding space\nand map them into a matrix. Based on each distance matrix, we compute the\ncorrelation of distances of a sentence vector with respect to the other\nsentence vectors in an embedding space. Then we compute the correlation of each\npair of the distance matrices. We observed correlations of the same sentence in\ndifferent embedding spaces and correlations of different sentences in the same\nembedding space. These observations are consistent with our hypothesis and take\nus to the next stage.\n","authors":["Tianyi Sun","Bradley Nelson"],"pdf_url":"https://arxiv.org/pdf/2308.03565v2.pdf","comment":"70 pages"},{"id":"http://arxiv.org/abs/2305.10652v2","updated":"2023-08-08T11:10:32Z","published":"2023-05-18T02:19:05Z","title":"Speech Separation based on Contrastive Learning and Deep Modularization","summary":" The current monaural state of the art tools for speech separation relies on\nsupervised learning. This means that they must deal with permutation problem,\nthey are impacted by the mismatch on the number of speakers used in training\nand inference. Moreover, their performance heavily relies on the presence of\nhigh-quality labelled data. These problems can be effectively addressed by\nemploying a fully unsupervised technique for speech separation. In this paper,\nwe use contrastive learning to establish the representations of frames then use\nthe learned representations in the downstream deep modularization task.\nConcretely, we demonstrate experimentally that in speech separation, different\nframes of a speaker can be viewed as augmentations of a given hidden standard\nframe of that speaker. The frames of a speaker contain enough prosodic\ninformation overlap which is key in speech separation. Based on this, we\nimplement a self-supervised learning to learn to minimize the distance between\nframes belonging to a given speaker. 
The learned representations are used in a\ndownstream deep modularization task to cluster frames based on speaker\nidentity. Evaluation of the developed technique on WSJ0-2mix and WSJ0-3mix\nshows that the technique attains SI-SNRi and SDRi of 20.8 and 21.0 respectively\nin WSJ0-2mix. In WSJ0-3mix, it attains SI-SNRi and SDRi of 20.7 and 20.7\nrespectively in WSJ0-2mix. Its greatest strength being that as the number of\nspeakers increase, its performance does not degrade significantly.\n","authors":["Peter Ochieng"],"pdf_url":"https://arxiv.org/pdf/2305.10652v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2212.00369"},{"id":"http://arxiv.org/abs/2308.04180v1","updated":"2023-08-08T10:42:33Z","published":"2023-08-08T10:42:33Z","title":"Studying Socially Unacceptable Discourse Classification (SUD) through\n different eyes: \"Are we on the same page ?\"","summary":" We study Socially Unacceptable Discourse (SUD) characterization and detection\nin online text. We first build and present a novel corpus that contains a large\nvariety of manually annotated texts from different online sources used so far\nin state-of-the-art Machine learning (ML) SUD detection solutions. This global\ncontext allows us to test the generalization ability of SUD classifiers that\nacquire knowledge around the same SUD categories, but from different contexts.\nFrom this perspective, we can analyze how (possibly) different annotation\nmodalities influence SUD learning by discussing open challenges and open\nresearch directions. We also provide several data insights which can support\ndomain experts in the annotation task.\n","authors":["Bruno Machado Carneiro","Michele Linardi","Julien Longhi"],"pdf_url":"https://arxiv.org/pdf/2308.04180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04176v1","updated":"2023-08-08T10:23:04Z","published":"2023-08-08T10:23:04Z","title":"On Monotonic Aggregation for Open-domain QA","summary":" Question answering (QA) is a critical task for speech-based retrieval from\nknowledge sources, by sifting only the answers without requiring to read\nsupporting documents. Specifically, open-domain QA aims to answer user\nquestions on unrestricted knowledge sources. Ideally, adding a source should\nnot decrease the accuracy, but we find this property (denoted as\n\"monotonicity\") does not hold for current state-of-the-art methods. We identify\nthe cause, and based on that we propose Judge-Specialist framework. Our\nframework consists of (1) specialist retrievers/readers to cover individual\nsources, and (2) judge, a dedicated language model to select the final answer.\nOur experiments show that our framework not only ensures monotonicity, but also\noutperforms state-of-the-art multi-source QA methods on Natural Questions.\nAdditionally, we show that our models robustly preserve the monotonicity\nagainst noise from speech recognition. We publicly release our code and\nsetting.\n","authors":["Sang-eun Han","Yeonseok Jeong","Seung-won Hwang","Kyungjae Lee"],"pdf_url":"https://arxiv.org/pdf/2308.04176v1.pdf","comment":"INTERSPEECH 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2306.02864v2","updated":"2023-08-08T09:48:36Z","published":"2023-06-05T13:35:01Z","title":"Leveraging Large Language Models for Topic Classification in the Domain\n of Public Affairs","summary":" The analysis of public affairs documents is crucial for citizens as it\npromotes transparency, accountability, and informed decision-making. 
It allows\ncitizens to understand government policies, participate in public discourse,\nand hold representatives accountable. This is crucial, and sometimes a matter\nof life or death, for companies whose operations depend on certain regulations.\nLarge Language Models (LLMs) have the potential to greatly enhance the analysis\nof public affairs documents by effectively processing and understanding the\ncomplex language used in such documents. In this work, we analyze the\nperformance of LLMs in classifying public affairs documents. As a natural\nmulti-label task, the classification of these documents presents important\nchallenges. In this work, we use a regex-powered tool to collect a database of\npublic affairs documents with more than 33K samples and 22.5M tokens. Our\nexperiments assess the performance of 4 different Spanish LLMs to classify up\nto 30 different topics in the data in different configurations. The results\nshow that LLMs can be of great use to process domain-specific documents, such\nas those in the domain of public affairs.\n","authors":["Alejandro Peña","Aythami Morales","Julian Fierrez","Ignacio Serna","Javier Ortega-Garcia","Iñigo Puente","Jorge Cordova","Gonzalo Cordova"],"pdf_url":"https://arxiv.org/pdf/2306.02864v2.pdf","comment":"Accepted in ICDAR 2023 Workshop on Automatic Domain-Adapted and\n Personalized Document Analysis"},{"id":"http://arxiv.org/abs/2308.02582v2","updated":"2023-08-08T08:57:20Z","published":"2023-08-01T05:31:36Z","title":"Adapt and Decompose: Efficient Generalization of Text-to-SQL via Domain\n Adapted Least-To-Most Prompting","summary":" Cross-domain and cross-compositional generalization of Text-to-SQL semantic\nparsing is a challenging task. Existing Large Language Model (LLM) based\nsolutions rely on inference-time retrieval of few-shot exemplars from the\ntraining set to synthesize a run-time prompt for each Natural Language (NL)\ntest query. In contrast, we devise an algorithm which performs offline sampling\nof a minimal set of few-shots from the training data, with complete coverage of\nSQL clauses, operators and functions, and maximal domain coverage within the\nallowed token length. This allows for synthesis of a fixed Generic Prompt (GP),\nwith a diverse set of exemplars common across NL test queries, avoiding\nexpensive test time exemplar retrieval. We further auto-adapt the GP to the\ntarget database domain (DA-GP), to better handle cross-domain generalization;\nfollowed by a decomposed Least-To-Most-Prompting (LTMP-DA-GP) to handle\ncross-compositional generalization. The synthesis of LTMP-DA-GP is an offline\ntask, to be performed one-time per new database with minimal human\nintervention. Our approach demonstrates superior performance on the KaggleDBQA\ndataset, designed to evaluate generalizability for the Text-to-SQL task. 
We\nfurther showcase consistent performance improvement of LTMP-DA-GP over GP,\nacross LLMs and databases of KaggleDBQA, highlighting the efficacy and model\nagnostic benefits of our prompt based adapt and decompose approach.\n","authors":["Aseem Arora","Shabbirhussain Bhaisaheb","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2308.02582v2.pdf","comment":"22 Pages"},{"id":"http://arxiv.org/abs/2308.04138v1","updated":"2023-08-08T08:57:01Z","published":"2023-08-08T08:57:01Z","title":"Large Language Model Prompt Chaining for Long Legal Document\n Classification","summary":" Prompting is used to guide or steer a language model in generating an\nappropriate response that is consistent with the desired outcome. Chaining is a\nstrategy used to decompose complex tasks into smaller, manageable components.\nIn this study, we utilize prompt chaining for extensive legal document\nclassification tasks, which present difficulties due to their intricate\ndomain-specific language and considerable length. Our approach begins with the\ncreation of a concise summary of the original document, followed by a semantic\nsearch for related exemplar texts and their corresponding annotations from a\ntraining corpus. Finally, we prompt for a label - based on the task - to\nassign, by leveraging the in-context learning from the few-shot prompt. We\ndemonstrate that through prompt chaining, we can not only enhance the\nperformance over zero-shot, but also surpass the micro-F1 score achieved by\nlarger models, such as ChatGPT zero-shot, using smaller models.\n","authors":["Dietrich Trautmann"],"pdf_url":"https://arxiv.org/pdf/2308.04138v1.pdf","comment":"SwissText 2023 Late Breaking Work (Generative AI & LLM)"},{"id":"http://arxiv.org/abs/2308.04124v1","updated":"2023-08-08T08:27:57Z","published":"2023-08-08T08:27:57Z","title":"Social Media, Topic Modeling and Sentiment Analysis in Municipal\n Decision Support","summary":" Many cities around the world are aspiring to become smart cities. However, smart\ninitiatives often give little weight to the opinions of average citizens.\n Social media are one of the most important sources of citizen opinions. This\npaper presents a prototype of a framework for processing social media posts\nwith municipal decision-making in mind. The framework consists of a sequence of\nthree steps: (1) determining the sentiment polarity of each social media post,\n(2) identifying prevalent topics and mapping these topics to individual posts,\nand (3) aggregating these two pieces of information into a fuzzy number\nrepresenting the overall sentiment expressed towards each topic. Optionally,\nthe fuzzy number can be reduced into a tuple of two real numbers indicating the\n\"amount\" of positive and negative opinion expressed towards each topic.\n The framework is demonstrated on tweets published from Ostrava, Czechia over\na period of about two months. This application illustrates how fuzzy numbers\nrepresent sentiment in a richer way and capture the diversity of opinions\nexpressed on social media.\n","authors":["Miloš Švaňa"],"pdf_url":"https://arxiv.org/pdf/2308.04124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07748v4","updated":"2023-08-08T08:08:12Z","published":"2023-02-15T15:54:01Z","title":"Whats New? 
Identifying the Unfolding of New Events in Narratives","summary":" Narratives include a rich source of events unfolding over time and context.\nAutomatic understanding of these events provides a summarised comprehension of\nthe narrative for further computation (such as reasoning). In this paper, we\nstudy the Information Status (IS) of the events and propose a novel challenging\ntask: the automatic identification of new events in a narrative. We define an\nevent as a triplet of subject, predicate, and object. The event is categorized\nas new with respect to the discourse context and whether it can be inferred\nthrough commonsense reasoning. We annotated a publicly available corpus of\nnarratives with the new events at sentence level using human annotators. We\npresent the annotation protocol and study the quality of the annotation and the\ndifficulty of the task. We publish the annotated dataset, annotation materials,\nand machine learning baseline models for the task of new event extraction for\nnarrative understanding.\n","authors":["Seyed Mahed Mousavi","Shohei Tanaka","Gabriel Roccabruna","Koichiro Yoshino","Satoshi Nakamura","Giuseppe Riccardi"],"pdf_url":"https://arxiv.org/pdf/2302.07748v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04114v1","updated":"2023-08-08T08:00:52Z","published":"2023-08-08T08:00:52Z","title":"Collective Human Opinions in Semantic Textual Similarity","summary":" Despite the subjective nature of semantic textual similarity (STS) and\npervasive disagreements in STS annotation, existing benchmarks have used\naveraged human ratings as the gold standard. Averaging masks the true\ndistribution of human opinions on examples of low agreement, and prevents\nmodels from capturing the semantic vagueness that the individual ratings\nrepresent. In this work, we introduce USTS, the first Uncertainty-aware STS\ndataset with ~15,000 Chinese sentence pairs and 150,000 labels, to study\ncollective human opinions in STS. Analysis reveals that neither a scalar nor a\nsingle Gaussian fits a set of observed judgements adequately. We further show\nthat current STS models cannot capture the variance caused by human\ndisagreement on individual instances, but rather reflect the predictive\nconfidence over the aggregate dataset.\n","authors":["Yuxia Wang","Shimin Tao","Ning Xie","Hao Yang","Timothy Baldwin","Karin Verspoor"],"pdf_url":"https://arxiv.org/pdf/2308.04114v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.03421v2","updated":"2023-08-08T07:58:06Z","published":"2023-08-07T09:14:33Z","title":"RecycleGPT: An Autoregressive Language Model with Recyclable Module","summary":" Existing large language models have to run K times to generate a sequence of\nK tokens. In this paper, we present RecycleGPT, a generative language model\nwith fast decoding speed by recycling pre-generated model states without\nrunning the whole model in multiple steps. Our approach relies on the\nobservation that adjacent tokens in a sequence usually have strong correlations\nand the next token in a sequence can be reasonably guessed or inferred based on\nthe preceding ones. 
Experiments and analysis demonstrate the effectiveness of\nour approach in lowering inference latency, achieving up to 1.4x speedup while\npreserving high performance.\n","authors":["Yufan Jiang","Qiaozhi He","Xiaomin Zhuang","Zhihua Wu","Kunpeng Wang","Wenlai Zhao","Guangwen Yang"],"pdf_url":"https://arxiv.org/pdf/2308.03421v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.04109v1","updated":"2023-08-08T07:47:10Z","published":"2023-08-08T07:47:10Z","title":"I-WAS: a Data Augmentation Method with GPT-2 for Simile Detection","summary":" Simile detection is a valuable task for many natural language processing\n(NLP)-based applications, particularly in the field of literature. However,\nexisting research on simile detection often relies on corpora that are limited\nin size and do not adequately represent the full range of simile forms. To\naddress this issue, we propose a simile data augmentation method based on\n\\textbf{W}ord replacement And Sentence completion using the GPT-2 language\nmodel. Our iterative process, called I-WAS, is designed to improve the quality\nof the augmented sentences. To better evaluate the performance of our method in\nreal-world applications, we have compiled a corpus containing a more diverse\nset of simile forms for experimentation. Our experimental results demonstrate\nthe effectiveness of our proposed data augmentation method for simile\ndetection.\n","authors":["Yongzhu Chang","Rongsheng Zhang","Jiashu Pu"],"pdf_url":"https://arxiv.org/pdf/2308.04109v1.pdf","comment":"15 pages, 1 figure"},{"id":"http://arxiv.org/abs/2201.05337v4","updated":"2023-08-08T06:50:57Z","published":"2022-01-14T08:32:20Z","title":"A Survey of Controllable Text Generation using Transformer-based\n Pre-trained Language Models","summary":" Controllable Text Generation (CTG) is an emerging area in the field of natural\nlanguage generation (NLG). It is regarded as crucial for the development of\nadvanced text generation technologies that better meet the specific constraints\nin practical applications. In recent years, methods using large-scale\npre-trained language models (PLMs), in particular the widely used\ntransformer-based PLMs, have become a new paradigm of NLG, allowing generation\nof more diverse and fluent text. However, due to the limited level of\ninterpretability of deep neural networks, the controllability of these methods\nneeds to be guaranteed. To this end, controllable text generation using\ntransformer-based PLMs has become a rapidly growing yet challenging new\nresearch hotspot. A diverse range of approaches have emerged in the past 3-4\nyears, targeting different CTG tasks that require different types of controlled\nconstraints. In this paper, we present a systematic critical review on the\ncommon tasks, main approaches, and evaluation methods in this area. Finally, we\ndiscuss the challenges that the field is facing, and put forward various\npromising future directions. To the best of our knowledge, this is the first\nsurvey paper to summarize the state-of-the-art CTG techniques from the\nperspective of Transformer-based PLMs. 
We hope it can help researchers and\npractitioners in the related fields to quickly track the academic and\ntechnological frontier, providing them with a landscape of the area and a\nroadmap for future research.\n","authors":["Hanqing Zhang","Haolin Song","Shaoyu Li","Ming Zhou","Dawei Song"],"pdf_url":"https://arxiv.org/pdf/2201.05337v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04076v1","updated":"2023-08-08T06:21:58Z","published":"2023-08-08T06:21:58Z","title":"DataTales: Investigating the use of Large Language Models for Authoring\n Data-Driven Articles","summary":" Authoring data-driven articles is a complex process requiring authors to not\nonly analyze data for insights but also craft a cohesive narrative that\neffectively communicates the insights. Text generation capabilities of\ncontemporary large language models (LLMs) present an opportunity to assist the\nauthoring of data-driven articles and expedite the writing process. In this\nwork, we investigate the feasibility and perceived value of leveraging LLMs to\nsupport authors of data-driven articles. We designed a prototype system,\nDataTales, that leverages a LLM to generate textual narratives accompanying a\ngiven chart. Using DataTales as a design probe, we conducted a qualitative\nstudy with 11 professionals to evaluate the concept, from which we distilled\naffordances and opportunities to further integrate LLMs as valuable data-driven\narticle authoring assistants.\n","authors":["Nicole Sultanum","Arjun Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2308.04076v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.04052v1","updated":"2023-08-08T05:16:51Z","published":"2023-08-08T05:16:51Z","title":"The Five-Dollar Model: Generating Game Maps and Sprites from Sentence\n Embeddings","summary":" The five-dollar model is a lightweight text-to-image generative architecture\nthat generates low dimensional images from an encoded text prompt. This model\ncan successfully generate accurate and aesthetically pleasing content in low\ndimensional domains, with limited amounts of training data. Despite the small\nsize of both the model and datasets, the generated images are still able to\nmaintain the encoded semantic meaning of the textual prompt. We apply this\nmodel to three small datasets: pixel art video game maps, video game sprite\nimages, and down-scaled emoji images and apply novel augmentation strategies to\nimprove the performance of our model on these limited datasets. We evaluate our\nmodels performance using cosine similarity score between text-image pairs\ngenerated by the CLIP VIT-B/32 model.\n","authors":["Timothy Merino","Roman Negri","Dipika Rajesh","M Charity","Julian Togelius"],"pdf_url":"https://arxiv.org/pdf/2308.04052v1.pdf","comment":"to be published in AIIDE 2023"},{"id":"http://arxiv.org/abs/2308.04041v1","updated":"2023-08-08T04:37:41Z","published":"2023-08-08T04:37:41Z","title":"InfeRE: Step-by-Step Regex Generation via Chain of Inference","summary":" Automatically generating regular expressions (abbrev. regexes) from natural\nlanguage description (NL2RE) has been an emerging research area. Prior studies\ntreat regex as a linear sequence of tokens and generate the final expressions\nautoregressively in a single pass. They did not take into account the\nstep-by-step internal text-matching processes behind the final results. This\nsignificantly hinders the efficacy and interpretability of regex generation by\nneural language models. 
In this paper, we propose a new paradigm called InfeRE,\nwhich decomposes the generation of regexes into chains of step-by-step\ninference. To enhance the robustness, we introduce a self-consistency decoding\nmechanism that ensembles multiple outputs sampled from different models. We\nevaluate InfeRE on two publicly available datasets, NL-RX-Turk and KB13, and\ncompare the results with state-of-the-art approaches and the popular tree-based\ngeneration approach TRANX. Experimental results show that InfeRE substantially\noutperforms previous baselines, yielding 16.3% and 14.7% improvement in DFA@5\naccuracy on two datasets, respectively. Particularly, InfeRE outperforms the\npopular tree-based generation approach by 18.1% and 11.3% on both datasets,\nrespectively, in terms of DFA@5 accuracy.\n","authors":["Shuai Zhang","Xiaodong Gu","Yuting Chen","Beijun Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04041v1.pdf","comment":"This paper has been accepted by ASE'23"},{"id":"http://arxiv.org/abs/2308.04037v1","updated":"2023-08-08T04:27:34Z","published":"2023-08-08T04:27:34Z","title":"A Comparative Study on TF-IDF feature Weighting Method and its Analysis\n using Unstructured Dataset","summary":" Text Classification is the process of categorizing text into the relevant\ncategories, and its algorithms are at the core of many Natural Language\nProcessing (NLP) applications. Term Frequency-Inverse Document Frequency (TF-IDF) and NLP\nare the most highly used information retrieval methods in text classification.\nWe have investigated and analyzed the feature weighting method for text\nclassification on unstructured data. The proposed model considered two features,\nN-Grams and TF-IDF, on the IMDB movie reviews and Amazon Alexa reviews dataset\nfor sentiment analysis. Then we used state-of-the-art classifiers to\nvalidate the method, i.e., Support Vector Machine (SVM), Logistic Regression,\nMultinomial Naive Bayes (Multinomial NB), Random Forest, Decision Tree, and\nk-nearest neighbors (KNN). Between the two feature extraction methods, TF-IDF features\nyielded a significant performance increase over N-Grams. TF-IDF achieved the maximum accuracy (93.81%), precision (94.20%), recall\n(93.81%), and F1-score (91.99%) with the Random Forest classifier.\n","authors":["Mamata Das","Selvakumar K.","P. J. A. Alphonse"],"pdf_url":"https://arxiv.org/pdf/2308.04037v1.pdf","comment":"10 pages, 3 figures, COLINS-2021, 5th International Conference on\n Computational Linguistics and Intelligent Systems, April 22-23, 2021,\n Kharkiv, Ukraine"},{"id":"http://arxiv.org/abs/2307.10457v3","updated":"2023-08-08T04:18:34Z","published":"2023-07-19T21:00:16Z","title":"Improving the Reusability of Pre-trained Language Models in Real-world\n Applications","summary":" The reusability of state-of-the-art Pre-trained Language Models (PLMs) is\noften limited by their generalization problem, where their performance\ndrastically decreases when evaluated on examples that differ from the training\ndataset, known as Out-of-Distribution (OOD)/unseen examples. This limitation\narises from PLMs' reliance on spurious correlations, which work well for\nfrequent example types but not for general examples. To address this issue, we\npropose a training approach called Mask-tuning, which integrates Masked\nLanguage Modeling (MLM) training objectives into the fine-tuning process to\nenhance PLMs' generalization. 
Comprehensive experiments demonstrate that\nMask-tuning surpasses current state-of-the-art techniques and enhances PLMs'\ngeneralization on OOD datasets while improving their performance on\nin-distribution datasets. The findings suggest that Mask-tuning improves the\nreusability of PLMs on unseen data, making them more practical and effective\nfor real-world applications.\n","authors":["Somayeh Ghanbarzadeh","Hamid Palangi","Yan Huang","Radames Cruz Moreno","Hamed Khanpour"],"pdf_url":"https://arxiv.org/pdf/2307.10457v3.pdf","comment":"Accepted as a long paper and awarded as the BEST Resaerch Paper in\n IEEE IRI'23 (IEEE 24th International conference on Information Reuse and\n Integrationfor Data Science)"},{"id":"http://arxiv.org/abs/2308.04028v1","updated":"2023-08-08T04:06:11Z","published":"2023-08-08T04:06:11Z","title":"Top K Relevant Passage Retrieval for Biomedical Question Answering","summary":" Question answering is a task that answers factoid questions using a large\ncollection of documents. It aims to provide precise answers in response to the\nuser's questions in natural language. Question answering relies on efficient\npassage retrieval to select candidate contexts, where traditional sparse vector\nspace models, such as TF-IDF or BM25, are the de facto method. On the web,\nthere is no single article that could provide all the possible answers\navailable on the internet to the question of the problem asked by the user. The\nexisting Dense Passage Retrieval model has been trained on Wikipedia dump from\nDec. 20, 2018, as the source documents for answering questions. Question\nanswering (QA) has made big strides with several open-domain and machine\ncomprehension systems built using large-scale annotated datasets. However, in\nthe clinical domain, this problem remains relatively unexplored. According to\nmultiple surveys, Biomedical Questions cannot be answered correctly from\nWikipedia Articles. In this work, we work on the existing DPR framework for the\nbiomedical domain and retrieve answers from the Pubmed articles which is a\nreliable source to answer medical questions. When evaluated on a BioASQ QA\ndataset, our fine-tuned dense retriever results in a 0.81 F1 score.\n","authors":["Shashank Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.04028v1.pdf","comment":"6 pages, 5 figures. arXiv admin note: text overlap with\n arXiv:2004.04906 by other authors"},{"id":"http://arxiv.org/abs/2306.07848v6","updated":"2023-08-08T03:41:47Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Speech Emotion Recognition","summary":" Contrastive learning based cross-modality pretraining approaches have\nrecently exhibited impressive success in diverse fields. In this paper, we\npropose GEmo-CLAP, a kind of gender-attribute-enhanced contrastive\nlanguage-audio pretraining (CLAP) method for speech emotion recognition.\nSpecifically, a novel emotion CLAP model (Emo-CLAP) is first built, utilizing\npre-trained WavLM and RoBERTa models. Second, given the significance of the\ngender attribute in speech emotion modeling, two novel soft label based\nGEmo-CLAP (SL-GEmo-CLAP) and multi-task learning based GEmo-CLAP (ML-GEmo-CLAP)\nmodels are further proposed to integrate emotion and gender information of\nspeech signals, forming more reasonable objectives. 
Extensive experiments on\nIEMOCAP show that our two proposed GEmo-CLAP models consistently outperform the\nbaseline Emo-CLAP, while also achieving the best recognition performance\ncompared with recent state-of-the-art methods. Noticeably, the proposed\nSL-GEmo-CLAP model achieves the best UAR of 81.43\\% and WAR of 83.16\\%, which\nperforms better than other state-of-the-art SER methods by at least 3\\%.\n","authors":["Yu Pan","Yanni Hu","Yuguang Yang","Jixun Yao","Wen Fei","Lei Ma","Heng Lu"],"pdf_url":"https://arxiv.org/pdf/2306.07848v6.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.01681v2","updated":"2023-08-08T03:19:10Z","published":"2023-08-03T10:48:30Z","title":"NBIAS: A Natural Language Processing Framework for Bias Identification\n in Text","summary":" Bias in textual data can lead to skewed interpretations and outcomes when the\ndata is used. These biases could perpetuate stereotypes, discrimination, or\nother forms of unfair treatment. An algorithm trained on biased data ends up\nmaking decisions that disproportionately impact a certain group of people.\nTherefore, it is crucial to detect and remove these biases to ensure the fair\nand ethical use of data. To this end, we develop a comprehensive and robust\nframework \\textsc{Nbias} that consists of a data layer, corpus construction layer,\nmodel development layer, and an evaluation layer. The dataset is constructed by\ncollecting diverse data from various fields, including social media,\nhealthcare, and job hiring portals. As such, we applied a transformer-based\ntoken classification model that is able to identify bias words/phrases through\na unique named entity. In the assessment procedure, we incorporate a blend of\nquantitative and qualitative evaluations to gauge the effectiveness of our\nmodels. We achieve accuracy improvements ranging from 1% to 8% compared to\nbaselines. We are also able to generate a robust understanding of the model\nfunctioning, capturing not only numerical data but also the quality and\nintricacies of its performance. The proposed approach is applicable to a\nvariety of biases and contributes to the fair and ethical use of textual data.\n","authors":["Shaina Raza","Muskan Garg","Deepak John Reji","Syed Raza Bashir","Chen Ding"],"pdf_url":"https://arxiv.org/pdf/2308.01681v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.04014v1","updated":"2023-08-08T03:18:18Z","published":"2023-08-08T03:18:18Z","title":"Continual Pre-Training of Large Language Models: How to (re)warm your\n model?","summary":" Large language models (LLMs) are routinely pre-trained on billions of tokens,\nonly to restart the process over again once new data becomes available. A much\ncheaper and more efficient solution would be to enable the continual\npre-training of these models, i.e. updating pre-trained models with new data\ninstead of re-training them from scratch. However, the distribution shift\ninduced by novel data typically results in degraded performance on past data.\nTaking a step towards efficient continual pre-training, in this work, we\nexamine the effect of different warm-up strategies. Our hypothesis is that the\nlearning rate must be re-increased to improve compute efficiency when training\non a new dataset. We study the warmup phase of models pre-trained on the Pile\n(upstream data, 300B tokens) as we continue to pre-train on SlimPajama\n(downstream data, 297B tokens), following a linear warmup and cosine decay\nschedule. 
We conduct all experiments on the Pythia 410M language model\narchitecture and evaluate performance through validation perplexity. We\nexperiment with different pre-training checkpoints, various maximum learning\nrates, and various warmup lengths. Our results show that while rewarming models\nfirst increases the loss on upstream and downstream data, in the longer run it\nimproves the downstream performance, outperforming models trained from\nscratch$\\unicode{x2013}$even for a large downstream dataset.\n","authors":["Kshitij Gupta","Benjamin Thérien","Adam Ibrahim","Mats L. Richter","Quentin Anthony","Eugene Belilovsky","Irina Rish","Timothée Lesort"],"pdf_url":"https://arxiv.org/pdf/2308.04014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03131v2","updated":"2023-08-08T02:01:14Z","published":"2023-08-06T14:49:26Z","title":"Towards Multiple References Era -- Addressing Data Leakage and Limited\n Reference Diversity in NLG Evaluation","summary":" N-gram matching-based evaluation metrics, such as BLEU and chrF, are widely\nutilized across a range of natural language generation (NLG) tasks. However,\nrecent studies have revealed a weak correlation between these matching-based\nmetrics and human evaluations, especially when compared with neural-based\nmetrics like BLEURT. In this paper, we conjecture that the performance\nbottleneck in matching-based metrics may be caused by the limited diversity of\nreferences. To address this issue, we propose to utilize \\textit{multiple\nreferences} to enhance the consistency between these metrics and human\nevaluations. Within the WMT Metrics benchmarks, we observe that the\nmulti-references F200spBLEU surpasses the conventional single-reference one by\nan accuracy improvement of 7.2\\%. Remarkably, it also exceeds the neural-based\nBERTscore by an accuracy enhancement of 3.9\\%. Moreover, we observe that the\ndata leakage issue in large language models (LLMs) can be mitigated to a large\nextent by our multi-reference metric. We release the code and data at\n\\url{https://github.com/SefaZeng/LLM-Ref}\n","authors":["Xianfeng Zeng","Yijin Liu","Fandong Meng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.03131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03983v1","updated":"2023-08-08T02:00:43Z","published":"2023-08-08T02:00:43Z","title":"SimplyRetrieve: A Private and Lightweight Retrieval-Centric Generative\n AI Tool","summary":" Large Language Model (LLM) based Generative AI systems have seen significant\nprogress in recent years. Integrating a knowledge retrieval architecture allows\nfor seamless integration of private data into publicly available Generative AI\nsystems using pre-trained LLM without requiring additional model fine-tuning.\nMoreover, Retrieval-Centric Generation (RCG) approach, a promising future\nresearch direction that explicitly separates roles of LLMs and retrievers in\ncontext interpretation and knowledge memorization, potentially leads to more\nefficient implementation. SimplyRetrieve is an open-source tool with the goal\nof providing a localized, lightweight, and user-friendly interface to these\nsophisticated advancements to the machine learning community. SimplyRetrieve\nfeatures a GUI and API based RCG platform, assisted by a Private Knowledge Base\nConstructor and a Retrieval Tuning Module. By leveraging these capabilities,\nusers can explore the potential of RCG for improving generative AI performance\nwhile maintaining privacy standards. 
The tool is available at\nhttps://github.com/RCGAI/SimplyRetrieve with an MIT license.\n","authors":["Youyang Ng","Daisuke Miyashita","Yasuto Hoshi","Yasuhiro Morioka","Osamu Torii","Tomoya Kodama","Jun Deguchi"],"pdf_url":"https://arxiv.org/pdf/2308.03983v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.04625v1","updated":"2023-08-08T23:31:10Z","published":"2023-08-08T23:31:10Z","title":"A Comparative Study of Sentence Embedding Models for Assessing Semantic\n Variation","summary":" Analyzing the pattern of semantic variation in long real-world texts such as\nbooks or transcripts is interesting from the stylistic, cognitive, and\nlinguistic perspectives. It is also useful for applications such as text\nsegmentation, document summarization, and detection of semantic novelty. The\nrecent emergence of several vector-space methods for sentence embedding has\nmade such analysis feasible. However, this raises the issue of how consistent\nand meaningful the semantic representations produced by various methods are in\nthemselves. In this paper, we compare several recent sentence embedding methods\nvia time-series of semantic similarity between successive sentences and\nmatrices of pairwise sentence similarity for multiple books of literature. In\ncontrast to previous work using target tasks and curated datasets to compare\nsentence embedding methods, our approach provides an evaluation of the methods\n'in the wild'. We find that most of the sentence embedding methods considered\ndo infer highly correlated patterns of semantic similarity in a given document,\nbut show interesting differences.\n","authors":["Deven M. Mistry","Ali A. Minai"],"pdf_url":"https://arxiv.org/pdf/2308.04625v1.pdf","comment":"12 pages, 6 figures, Accepted for publication in the Proceedings of\n the 2023 International Conference on Artificial Neural Networks, Heraklion,\n Greece, September 26-29, 2023"},{"id":"http://arxiv.org/abs/2308.04624v1","updated":"2023-08-08T23:30:20Z","published":"2023-08-08T23:30:20Z","title":"Benchmarking LLM powered Chatbots: Methods and Metrics","summary":" Autonomous conversational agents, i.e. chatbots, are becoming an increasingly\ncommon mechanism for enterprises to provide support to customers and partners.\nIn order to rate chatbots, especially ones powered by Generative AI tools like\nLarge Language Models (LLMs), we need to be able to accurately assess their\nperformance. This is where chatbot benchmarking becomes important. In this\npaper, we propose the use of a novel benchmark that we call the E2E (End to\nEnd) benchmark, and show how the E2E benchmark can be used to evaluate accuracy\nand usefulness of the answers provided by chatbots, especially ones powered by\nLLMs. We evaluate an example chatbot at different levels of sophistication\nbased on both our E2E benchmark, as well as other available metrics commonly\nused in the state of the art, and observe that the proposed benchmark shows better\nresults compared to others. In addition, while some metrics proved to be\nunpredictable, the metric associated with the E2E benchmark, which uses cosine\nsimilarity, performed well in evaluating chatbots. 
The performance of our best\nmodels shows that there are several benefits of using the cosine similarity\nscore as a metric in the E2E benchmark.\n","authors":["Debarag Banerjee","Pooja Singh","Arjun Avadhanam","Saksham Srivastava"],"pdf_url":"https://arxiv.org/pdf/2308.04624v1.pdf","comment":"8 pages, 14 figures"},{"id":"http://arxiv.org/abs/2308.04623v1","updated":"2023-08-08T23:29:55Z","published":"2023-08-08T23:29:55Z","title":"Accelerating LLM Inference with Staged Speculative Decoding","summary":" Recent advances with large language models (LLM) illustrate their diverse\ncapabilities. We propose a novel algorithm, staged speculative decoding, to\naccelerate LLM inference in small-batch, on-device scenarios. We address the\nlow arithmetic intensity of small-batch inference by improving upon previous\nwork in speculative decoding. First, we restructure the speculative batch as a\ntree, which reduces generation costs and increases the expected tokens per\nbatch. Second, we add a second stage of speculative decoding. Taken together,\nwe reduce single-batch decoding latency by 3.16x with a 762M parameter GPT-2-L\nmodel while perfectly preserving output quality.\n","authors":["Benjamin Spector","Chris Re"],"pdf_url":"https://arxiv.org/pdf/2308.04623v1.pdf","comment":"Published at ES-FOMO at ICML 2023"},{"id":"http://arxiv.org/abs/2307.07415v2","updated":"2023-08-08T21:26:53Z","published":"2023-07-13T00:49:27Z","title":"AutoHint: Automatic Prompt Optimization with Hint Generation","summary":" This paper presents AutoHint, a novel framework for automatic prompt\nengineering and optimization for Large Language Models (LLM). While LLMs have\ndemonstrated remarkable ability in achieving high-quality annotation in various\ntasks, the key to applying this ability to specific tasks lies in developing\nhigh-quality prompts. Thus we propose a framework to inherit the merits of both\nin-context learning and zero-shot learning by incorporating enriched\ninstructions derived from input-output demonstrations to optimize original\nprompt. We refer to the enrichment as the hint and propose a framework to\nautomatically generate the hint from labeled data. More concretely, starting\nfrom an initial prompt, our method first instructs a LLM to deduce new hints\nfor selected samples from incorrect predictions, and then summarizes from\nper-sample hints and adds the results back to the initial prompt to form a new,\nenriched instruction. The proposed method is evaluated on the BIG-Bench\nInstruction Induction dataset for both zero-shot and few-short prompts, where\nexperiments demonstrate our method is able to significantly boost accuracy for\nmultiple tasks.\n","authors":["Hong Sun","Xue Li","Yinchuan Xu","Youkow Homma","Qi Cao","Min Wu","Jian Jiao","Denis Charles"],"pdf_url":"https://arxiv.org/pdf/2307.07415v2.pdf","comment":"KDD 2023: Foundations and Applications in Large-scale AI\n Models-Pre-training, Fine-tuning, and Prompt-based Learning workshop"},{"id":"http://arxiv.org/abs/2308.04592v1","updated":"2023-08-08T21:23:23Z","published":"2023-08-08T21:23:23Z","title":"Shepherd: A Critic for Language Model Generation","summary":" As large language models improve, there is increasing interest in techniques\nthat leverage these models' capabilities to refine their own outputs. 
In this\nwork, we introduce Shepherd, a language model specifically tuned to critique\nresponses and suggest refinements, extending beyond the capabilities of an\nuntuned model to identify diverse errors and provide suggestions to remedy\nthem. At the core of our approach is a high quality feedback dataset, which we\ncurate from community feedback and human annotations. Even though Shepherd is\nsmall (7B parameters), its critiques are either equivalent or preferred to\nthose from established models including ChatGPT. Using GPT-4 for evaluation,\nShepherd reaches an average win-rate of 53-87% compared to competitive\nalternatives. In human evaluation, Shepherd strictly outperforms other models\nand on average closely ties with ChatGPT.\n","authors":["Tianlu Wang","Ping Yu","Xiaoqing Ellen Tan","Sean O'Brien","Ramakanth Pasunuru","Jane Dwivedi-Yu","Olga Golovneva","Luke Zettlemoyer","Maryam Fazel-Zarandi","Asli Celikyilmaz"],"pdf_url":"https://arxiv.org/pdf/2308.04592v1.pdf","comment":"7 figures, 7 tables"},{"id":"http://arxiv.org/abs/2308.04566v1","updated":"2023-08-08T20:29:13Z","published":"2023-08-08T20:29:13Z","title":"Single-Sentence Reader: A Novel Approach for Addressing Answer Position\n Bias","summary":" Machine Reading Comprehension (MRC) models tend to take advantage of spurious\ncorrelations (also known as dataset bias or annotation artifacts in the\nresearch community). Consequently, these models may perform the MRC task\nwithout fully comprehending the given context and question, which is\nundesirable since it may result in low robustness against distribution shift.\nThis paper delves into the concept of answer-position bias, where a significant\npercentage of training questions have answers located solely in the first\nsentence of the context. We propose a Single-Sentence Reader as a new approach\nfor addressing answer position bias in MRC. We implement this approach using\nsix different models and thoroughly analyze their performance. Remarkably, our\nproposed Single-Sentence Readers achieve results that nearly match those of\nmodels trained on conventional training sets, proving their effectiveness. Our\nstudy also discusses several challenges our Single-Sentence Readers encounter\nand proposes a potential solution.\n","authors":["Son Quoc Tran","Matt Kretchmar"],"pdf_url":"https://arxiv.org/pdf/2308.04566v1.pdf","comment":"11 pages, 5 tables, 2 figures. arXiv admin note: text overlap with\n arXiv:2211.16220 by other authors"},{"id":"http://arxiv.org/abs/2308.04534v1","updated":"2023-08-08T18:56:52Z","published":"2023-08-08T18:56:52Z","title":"Ahead of the Text: Leveraging Entity Preposition for Financial Relation\n Extraction","summary":" In the context of the ACM KDF-SIGIR 2023 competition, we undertook an entity\nrelation task on a dataset of financial entity relations called REFind. Our\ntop-performing solution involved a multi-step approach. Initially, we inserted\nthe provided entities at their corresponding locations within the text.\nSubsequently, we fine-tuned the transformer-based language model roberta-large\nfor text classification by utilizing a labeled training set to predict the\nentity relations. Lastly, we implemented a post-processing phase to identify\nand handle improbable predictions generated by the model. 
As a result of our\nmethodology, we achieved the 1st place ranking on the competition's public\nleaderboard.\n","authors":["Stefan Pasch","Dimitrios Petridis"],"pdf_url":"https://arxiv.org/pdf/2308.04534v1.pdf","comment":"Stefan Pasch, Dimitrios Petridis 2023. Ahead of the Text: Leveraging\n Entity Preposition for Financial Relation Extraction. ACM SIGIR: The 4th\n Workshop on Knowledge Discovery from Unstructured Data in Financial Services\n (SIGIR-KDF '23)"},{"id":"http://arxiv.org/abs/2308.04519v1","updated":"2023-08-08T18:35:22Z","published":"2023-08-08T18:35:22Z","title":"DisCoCat for Donkey Sentences","summary":" We demonstrate how to parse Geach's Donkey sentences in a compositional\ndistributional model of meaning. We build on previous work on the DisCoCat\n(Distributional Compositional Categorical) framework, including extensions that\nmodel discourse, determiners, and relative pronouns. We present a type-logical\nsyntax for parsing donkey sentences, for which we define both relational and\nvector space semantics.\n","authors":["Lachlan McPheat","Daphne Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04519v1.pdf","comment":"In Proceedings AMSLO 2023, arXiv:2308.03679"},{"id":"http://arxiv.org/abs/2308.04502v1","updated":"2023-08-08T18:11:27Z","published":"2023-08-08T18:11:27Z","title":"Revisiting Disentanglement and Fusion on Modality and Context in\n Conversational Multimodal Emotion Recognition","summary":" It has been a hot research topic to enable machines to understand human\nemotions in multimodal contexts under dialogue scenarios, which is tasked with\nmultimodal emotion analysis in conversation (MM-ERC). MM-ERC has received\nconsistent attention in recent years, where a diverse range of methods has been\nproposed for securing better task performance. Most existing works treat MM-ERC\nas a standard multimodal classification problem and perform multimodal feature\ndisentanglement and fusion for maximizing feature utility. Yet after revisiting\nthe characteristic of MM-ERC, we argue that both the feature multimodality and\nconversational contextualization should be properly modeled simultaneously\nduring the feature disentanglement and fusion steps. In this work, we target\nfurther pushing the task performance by taking full consideration of the above\ninsights. On the one hand, during feature disentanglement, based on the\ncontrastive learning technique, we devise a Dual-level Disentanglement\nMechanism (DDM) to decouple the features into both the modality space and\nutterance space. On the other hand, during the feature fusion stage, we propose\na Contribution-aware Fusion Mechanism (CFM) and a Context Refusion Mechanism\n(CRM) for multimodal and context integration, respectively. They together\nschedule the proper integrations of multimodal and context features.\nSpecifically, CFM explicitly manages the multimodal feature contributions\ndynamically, while CRM flexibly coordinates the introduction of dialogue\ncontexts. On two public MM-ERC datasets, our system achieves new\nstate-of-the-art performance consistently. Further analyses demonstrate that\nall our proposed mechanisms greatly facilitate the MM-ERC task by making full\nuse of the multimodal and context features adaptively. 
Note that our proposed\nmethods have the great potential to facilitate a broader range of other\nconversational multimodal tasks.\n","authors":["Bobo Li","Hao Fei","Lizi Liao","Yu Zhao","Chong Teng","Tat-Seng Chua","Donghong Ji","Fei Li"],"pdf_url":"https://arxiv.org/pdf/2308.04502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04673v2","updated":"2023-08-08T18:04:11Z","published":"2023-03-08T15:52:14Z","title":"Cost-Effective Hyperparameter Optimization for Large Language Model\n Generation Inference","summary":" Large Language Models (LLMs) have sparked significant interest in their\ngenerative capabilities, leading to the development of various commercial\napplications. The high cost of using the models drives application builders to\nmaximize the value of generation under a limited inference budget. This paper\npresents a study of optimizing inference hyperparameters such as the number of\nresponses, temperature and max tokens, which significantly affects the\nutility/cost of text generation. We design a framework named EcoOptiGen which\nleverages economical hyperparameter optimization and cost-based pruning.\nExperiments with the GPT-3.5/GPT-4 models on a variety of tasks verify its\neffectiveness. EcoOptiGen is implemented in the `autogen' package of the FLAML\nlibrary: \\url{https://aka.ms/autogen}.\n","authors":["Chi Wang","Susan Xueqing Liu","Ahmed H. Awadallah"],"pdf_url":"https://arxiv.org/pdf/2303.04673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04498v1","updated":"2023-08-08T18:03:29Z","published":"2023-08-08T18:03:29Z","title":"DialogRE^C+: An Extension of DialogRE to Investigate How Much\n Coreference Helps Relation Extraction in Dialogs","summary":" Dialogue relation extraction (DRE) that identifies the relations between\nargument pairs in dialogue text, suffers much from the frequent occurrence of\npersonal pronouns, or entity and speaker coreference. This work introduces a\nnew benchmark dataset DialogRE^C+, introducing coreference resolution into the\nDRE scenario. With the aid of high-quality coreference knowledge, the reasoning\nof argument relations is expected to be enhanced. In DialogRE^C+ dataset, we\nmanually annotate total 5,068 coreference chains over 36,369 argument mentions\nbased on the existing DialogRE data, where four different coreference chain\ntypes namely speaker chain, person chain, location chain and organization chain\nare explicitly marked. We further develop 4 coreference-enhanced graph-based\nDRE models, which learn effective coreference representations for improving the\nDRE task. We also train a coreference resolution model based on our annotations\nand evaluate the effect of automatically extracted coreference chains\ndemonstrating the practicality of our dataset and its potential to other\ndomains and tasks.\n","authors":["Yiyun Xiong","Mengwei Dai","Fei Li","Hao Fei","Bobo Li","Shengqiong Wu","Donghong Ji","Chong Teng"],"pdf_url":"https://arxiv.org/pdf/2308.04498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00595v2","updated":"2023-08-08T17:16:03Z","published":"2023-03-01T15:35:32Z","title":"A Universal Question-Answering Platform for Knowledge Graphs","summary":" Knowledge from diverse application domains is organized as knowledge graphs\n(KGs) that are stored in RDF engines accessible in the web via SPARQL\nendpoints. Expressing a well-formed SPARQL query requires information about the\ngraph structure and the exact URIs of its components, which is impractical for\nthe average user. 
Question answering (QA) systems assist by translating natural\nlanguage questions to SPARQL. Existing QA systems are typically based on\napplication-specific human-curated rules, or require prior information,\nexpensive pre-processing and model adaptation for each targeted KG. Therefore,\nthey are hard to generalize to a broad set of applications and KGs.\n In this paper, we propose KGQAn, a universal QA system that does not need to\nbe tailored to each target KG. Instead of curated rules, KGQAn introduces a\nnovel formalization of question understanding as a text generation problem to\nconvert a question into an intermediate abstract representation via a neural\nsequence-to-sequence model. We also develop a just-in-time linker that maps at\nquery time the abstract representation to a SPARQL query for a specific KG,\nusing only the publicly accessible APIs and the existing indices of the RDF\nstore, without requiring any pre-processing. Our experiments with several real\nKGs demonstrate that KGQAn is easily deployed and outperforms by a large margin\nthe state-of-the-art in terms of quality of answers and processing time,\nespecially for arbitrary KGs, unseen during the training.\n","authors":["Reham Omar","Ishika Dhall","Panos Kalnis","Essam Mansour"],"pdf_url":"https://arxiv.org/pdf/2303.00595v2.pdf","comment":"The paper is accepted to SIGMOD 2023"},{"id":"http://arxiv.org/abs/2308.04226v1","updated":"2023-08-08T12:45:01Z","published":"2023-08-08T12:45:01Z","title":"OpinionConv: Conversational Product Search with Grounded Opinions","summary":" When searching for products, the opinions of others play an important role in\nmaking informed decisions. Subjective experiences about a product can be a\nvaluable source of information. This is also true in sales conversations, where\na customer and a sales assistant exchange facts and opinions about products.\nHowever, training an AI for such conversations is complicated by the fact that\nlanguage models do not possess authentic opinions for their lack of real-world\nexperience. We address this problem by leveraging product reviews as a rich\nsource of product opinions to ground conversational AI in true subjective\nnarratives. With OpinionConv, we develop the first conversational AI for\nsimulating sales conversations. To validate the generated conversations, we\nconduct several user studies showing that the generated opinions are perceived\nas realistic. Our assessors also confirm the importance of opinions as an\ninformative basis for decision-making.\n","authors":["Vahid Sadiri Javadi","Martin Potthast","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.04226v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.04431v1","updated":"2023-08-08T17:58:45Z","published":"2023-08-08T17:58:45Z","title":"When More is Less: Incorporating Additional Datasets Can Hurt\n Performance By Introducing Spurious Correlations","summary":" In machine learning, incorporating more data is often seen as a reliable\nstrategy for improving model performance; this work challenges that notion by\ndemonstrating that the addition of external datasets in many cases can hurt the\nresulting model's performance. In a large-scale empirical study across\ncombinations of four different open-source chest x-ray datasets and 9 different\nlabels, we demonstrate that in 43% of settings, a model trained on data from\ntwo hospitals has poorer worst group accuracy over both hospitals than a model\ntrained on just a single hospital's data. 
This surprising result occurs even\nthough the added hospital makes the training distribution more similar to the\ntest distribution. We explain that this phenomenon arises from the spurious\ncorrelation that emerges between the disease and hospital, due to\nhospital-specific image artifacts. We highlight the trade-off one encounters\nwhen training on multiple datasets, between the obvious benefit of additional\ndata and insidious cost of the introduced spurious correlation. In some cases,\nbalancing the dataset can remove the spurious correlation and improve\nperformance, but it is not always an effective strategy. We contextualize our\nresults within the literature on spurious correlations to help explain these\noutcomes. Our experiments underscore the importance of exercising caution when\nselecting training data for machine learning models, especially in settings\nwhere there is a risk of spurious correlations such as with medical imaging.\nThe risks outlined highlight the need for careful data selection and model\nevaluation in future research and practice.\n","authors":["Rhys Compton","Lily Zhang","Aahlad Puli","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2308.04431v1.pdf","comment":"Accepted at MLHC 2023"},{"id":"http://arxiv.org/abs/2308.04426v1","updated":"2023-08-08T17:55:30Z","published":"2023-08-08T17:55:30Z","title":"A Deep-Learning Method Using Auto-encoder and Generative Adversarial\n Network for Anomaly Detection on Ancient Stone Stele Surfaces","summary":" Accurate detection of natural deterioration and man-made damage on the\nsurfaces of ancient stele in the first instance is essential for their\npreventive conservation. Existing methods for cultural heritage preservation\nare not able to achieve this goal perfectly due to the difficulty of balancing\naccuracy, efficiency, timeliness, and cost. This paper presents a deep-learning\nmethod to automatically detect above mentioned emergencies on ancient stone\nstele in real time, employing autoencoder (AE) and generative adversarial\nnetwork (GAN). The proposed method overcomes the limitations of existing\nmethods by requiring no extensive anomaly samples while enabling comprehensive\ndetection of unpredictable anomalies. the method includes stages of monitoring,\ndata acquisition, pre-processing, model structuring, and post-processing.\nTaking the Longmen Grottoes' stone steles as a case study, an unsupervised\nlearning model based on AE and GAN architectures is proposed and validated with\na reconstruction accuracy of 99.74\\%. The method's evaluation revealed the\nproficient detection of seven artificially designed anomalies and demonstrated\nprecision and reliability without false alarms. This research provides novel\nideas and possibilities for the application of deep learning in the field of\ncultural heritage.\n","authors":["Yikun Liu","Yuning Wang","Cheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04170v3","updated":"2023-08-08T17:49:29Z","published":"2023-05-07T03:00:06Z","title":"YOLOCS: Object Detection based on Dense Channel Compression for Feature\n Spatial Solidification","summary":" In this study, we examine the associations between channel features and\nconvolutional kernels during the processes of feature purification and gradient\nbackpropagation, with a focus on the forward and backward propagation within\nthe network. Consequently, we propose a method called Dense Channel Compression\nfor Feature Spatial Solidification. 
Drawing upon the central concept of this\nmethod, we introduce two innovative modules for backbone and head networks: the\nDense Channel Compression for Feature Spatial Solidification Structure (DCFS)\nand the Asymmetric Multi-Level Compression Decoupled Head (ADH). When\nintegrated into the YOLOv5 model, these two modules demonstrate exceptional\nperformance, resulting in a modified model referred to as YOLOCS. Evaluated on\nthe MSCOCO dataset, the large, medium, and small YOLOCS models yield AP of\n50.1%, 47.6%, and 42.5%, respectively. Maintaining inference speeds remarkably\nsimilar to those of the YOLOv5 model, the large, medium, and small YOLOCS\nmodels surpass the YOLOv5 model's AP by 1.1%, 2.3%, and 5.2%, respectively.\n","authors":["Lin Huang","Weisheng Li","Linlin Shen","Haojie Fu","Xue Xiao","Suihan Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.04170v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04417v1","updated":"2023-08-08T17:34:28Z","published":"2023-08-08T17:34:28Z","title":"DiffCR: A Fast Conditional Diffusion Framework for Cloud Removal from\n Optical Satellite Images","summary":" Optical satellite images are a critical data source; however, cloud cover\noften compromises their quality, hindering image applications and analysis.\nConsequently, effectively removing clouds from optical satellite images has\nemerged as a prominent research direction. While recent advancements in cloud\nremoval primarily rely on generative adversarial networks, which may yield\nsuboptimal image quality, diffusion models have demonstrated remarkable success\nin diverse image-generation tasks, showcasing their potential in addressing\nthis challenge. This paper presents a novel framework called DiffCR, which\nleverages conditional guided diffusion with deep convolutional networks for\nhigh-performance cloud removal for optical satellite imagery. Specifically, we\nintroduce a decoupled encoder for conditional image feature extraction,\nproviding a robust color representation to ensure the close similarity of\nappearance information between the conditional input and the synthesized\noutput. Moreover, we propose a novel and efficient time and condition fusion\nblock within the cloud removal model to accurately simulate the correspondence\nbetween the appearance in the conditional image and the target image at a low\ncomputational cost. Extensive experimental evaluations on two commonly used\nbenchmark datasets demonstrate that DiffCR consistently achieves\nstate-of-the-art performance on all metrics, with parameter and computational\ncomplexities amounting to only 5.1% and 5.4%, respectively, of those previous\nbest methods. The source code, pre-trained models, and all the experimental\nresults will be publicly available at https://github.com/XavierJiezou/DiffCR\nupon the paper's acceptance of this work.\n","authors":["Xuechao Zou","Kai Li","Junliang Xing","Yu Zhang","Shiying Wang","Lei Jin","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.04417v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.09345v2","updated":"2023-08-08T17:26:58Z","published":"2023-06-15T17:59:51Z","title":"Evaluating Data Attribution for Text-to-Image Models","summary":" While large text-to-image models are able to synthesize \"novel\" images, these\nimages are necessarily a reflection of the training data. 
The problem of data\nattribution in such models -- which of the images in the training set are most\nresponsible for the appearance of a given generated image -- is a difficult yet\nimportant one. As an initial step toward this problem, we evaluate attribution\nthrough \"customization\" methods, which tune an existing large-scale model\ntoward a given exemplar object or style. Our key insight is that this allows us\nto efficiently create synthetic images that are computationally influenced by\nthe exemplar by construction. With our new dataset of such exemplar-influenced\nimages, we are able to evaluate various data attribution algorithms and\ndifferent possible feature spaces. Furthermore, by training on our dataset, we\ncan tune standard models, such as DINO, CLIP, and ViT, toward the attribution\nproblem. Even though the procedure is tuned towards small exemplar sets, we\nshow generalization to larger sets. Finally, by taking into account the\ninherent uncertainty of the problem, we can assign soft attribution scores over\na set of training images.\n","authors":["Sheng-Yu Wang","Alexei A. Efros","Jun-Yan Zhu","Richard Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.09345v2.pdf","comment":"Updated v2 -- ICCV 2023 camera ready version. Project page:\n https://peterwang512.github.io/GenDataAttribution Code:\n https://github.com/PeterWang512/GenDataAttribution"},{"id":"http://arxiv.org/abs/2308.04413v1","updated":"2023-08-08T17:18:59Z","published":"2023-08-08T17:18:59Z","title":"Digging into Depth Priors for Outdoor Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) have demonstrated impressive performance in\nvision and graphics tasks, such as novel view synthesis and immersive reality.\nHowever, the shape-radiance ambiguity of radiance fields remains a challenge,\nespecially in the sparse viewpoints setting. Recent work resorts to integrating\ndepth priors into outdoor NeRF training to alleviate the issue. However, the\ncriteria for selecting depth priors and the relative merits of different priors\nhave not been thoroughly investigated. Moreover, the relative merits of\nselecting different approaches to use the depth priors is also an unexplored\nproblem. In this paper, we provide a comprehensive study and evaluation of\nemploying depth priors to outdoor neural radiance fields, covering common depth\nsensing technologies and most application ways. Specifically, we conduct\nextensive experiments with two representative NeRF methods equipped with four\ncommonly-used depth priors and different depth usages on two widely used\noutdoor datasets. Our experimental results reveal several interesting findings\nthat can potentially benefit practitioners and researchers in training their\nNeRF models with depth priors. Project Page:\nhttps://cwchenwang.github.io/outdoor-nerf-depth\n","authors":["Chen Wang","Jiadai Sun","Lina Liu","Chenming Wu","Zhelun Shen","Dayan Wu","Yuchao Dai","Liangjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.04413v1.pdf","comment":"Accepted to ACM MM 2023. Project Page:\n https://cwchenwang.github.io/outdoor-nerf-depth"},{"id":"http://arxiv.org/abs/2308.04409v1","updated":"2023-08-08T17:14:14Z","published":"2023-08-08T17:14:14Z","title":"V-DETR: DETR with Vertex Relative Position Encoding for 3D Object\n Detection","summary":" We introduce a highly performant 3D object detector for point clouds using\nthe DETR framework. 
The prior attempts all end up with suboptimal results\nbecause they fail to learn accurate inductive biases from the limited scale of\ntraining data. In particular, the queries often attend to points that are far\naway from the target objects, violating the locality principle in object\ndetection. To address the limitation, we introduce a novel 3D Vertex Relative\nPosition Encoding (3DV-RPE) method which computes position encoding for each\npoint based on its relative position to the 3D boxes predicted by the queries\nin each decoder layer, thus providing clear information to guide the model to\nfocus on points near the objects, in accordance with the principle of locality.\nIn addition, we systematically improve the pipeline from various aspects such\nas data normalization based on our understanding of the task. We show\nexceptional results on the challenging ScanNetV2 benchmark, achieving\nsignificant improvements over the previous 3DETR in\n$\\rm{AP}_{25}$/$\\rm{AP}_{50}$ from 65.0\\%/47.0\\% to 77.8\\%/66.0\\%,\nrespectively. In addition, our method sets a new record on ScanNetV2 and SUN\nRGB-D datasets.Code will be released at http://github.com/yichaoshen-MS/V-DETR.\n","authors":["Yichao Shen","Zigang Geng","Yuhui Yuan","Yutong Lin","Ze Liu","Chunyu Wang","Han Hu","Nanning Zheng","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2308.04409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04402v1","updated":"2023-08-08T17:04:53Z","published":"2023-08-08T17:04:53Z","title":"Person Re-Identification without Identification via Event Anonymization","summary":" Wide-scale use of visual surveillance in public spaces puts individual\nprivacy at stake while increasing resource consumption (energy, bandwidth, and\ncomputation). Neuromorphic vision sensors (event-cameras) have been recently\nconsidered a valid solution to the privacy issue because they do not capture\ndetailed RGB visual information of the subjects in the scene. However, recent\ndeep learning architectures have been able to reconstruct images from event\ncameras with high fidelity, reintroducing a potential threat to privacy for\nevent-based vision applications. In this paper, we aim to anonymize\nevent-streams to protect the identity of human subjects against such image\nreconstruction attacks. To achieve this, we propose an end-to-end network\narchitecture jointly optimized for the twofold objective of preserving privacy\nand performing a downstream task such as person ReId. Our network learns to\nscramble events, enforcing the degradation of images recovered from the privacy\nattacker. In this work, we also bring to the community the first ever\nevent-based person ReId dataset gathered to evaluate the performance of our\napproach. We validate our approach with extensive experiments and report\nresults on the synthetic event data simulated from the publicly available\nSoftBio dataset and our proposed Event-ReId dataset.\n","authors":["Shafiq Ahmad","Pietro Morerio","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2308.04402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04397v1","updated":"2023-08-08T17:01:33Z","published":"2023-08-08T17:01:33Z","title":"LEFormer: A Hybrid CNN-Transformer Architecture for Accurate Lake\n Extraction from Remote Sensing Imagery","summary":" Lake extraction from remote sensing imagery is challenging due to the complex\nshapes of lakes and the presence of noise. Existing methods suffer from blurred\nsegmentation boundaries and poor foreground modeling. 
In this paper, we propose\na hybrid CNN-Transformer architecture, called LEFormer, for accurate lake\nextraction. LEFormer contains four main modules: CNN encoder, Transformer\nencoder, cross-encoder fusion, and lightweight decoder. The CNN encoder\nrecovers local spatial information and improves fine-scale details.\nSimultaneously, the Transformer encoder captures long-range dependencies\nbetween sequences of any length, allowing it to obtain global features and\ncontext information better. Finally, a lightweight decoder is employed for mask\nprediction. We evaluate the performance and efficiency of LEFormer on two\ndatasets, the Surface Water (SW) and the Qinghai-Tibet Plateau Lake (QTPL).\nExperimental results show that LEFormer consistently achieves state-of-the-art\n(SOTA) performance and efficiency on these two datasets, outperforming existing\nmethods. Specifically, LEFormer achieves 90.86% and 97.42% mIoU on the SW and\nQTPL datasets with a parameter count of 3.61M, respectively, while being 20x\nsmaller than the previous SOTA method.\n","authors":["Ben Chen","Xuechao Zou","Yu Zhang","Jiayu Li","Kai Li","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.04397v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.04395v1","updated":"2023-08-08T17:00:11Z","published":"2023-08-08T17:00:11Z","title":"Data Augmentation-Based Unsupervised Domain Adaptation In Medical\n Imaging","summary":" Deep learning-based models in medical imaging often struggle to generalize\neffectively to new scans due to data heterogeneity arising from differences in\nhardware, acquisition parameters, population, and artifacts. This limitation\npresents a significant challenge in adopting machine learning models for\nclinical practice. We propose an unsupervised method for robust domain\nadaptation in brain MRI segmentation by leveraging MRI-specific augmentation\ntechniques. To evaluate the effectiveness of our method, we conduct extensive\nexperiments across diverse datasets, modalities, and segmentation tasks,\ncomparing against the state-of-the-art methods. The results show that our\nproposed approach achieves high accuracy, exhibits broad applicability, and\nshowcases remarkable robustness against domain shift in various tasks,\nsurpassing the state-of-the-art performance in the majority of cases.\n","authors":["Sebastian Nørgaard Llambias","Mads Nielsen","Mostafa Mehdipour Ghazi"],"pdf_url":"https://arxiv.org/pdf/2308.04395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04383v1","updated":"2023-08-08T16:37:24Z","published":"2023-08-08T16:37:24Z","title":"DELFlow: Dense Efficient Learning of Scene Flow for Large-Scale Point\n Clouds","summary":" Point clouds are naturally sparse, while image pixels are dense. The\ninconsistency limits feature fusion from both modalities for point-wise scene\nflow estimation. Previous methods rarely predict scene flow from the entire\npoint clouds of the scene with one-time inference due to the memory\ninefficiency and heavy overhead from distance calculation and sorting involved\nin commonly used farthest point sampling, KNN, and ball query algorithms for\nlocal feature aggregation. 
To mitigate these issues in scene flow learning, we\nregularize raw points to a dense format by storing 3D coordinates in 2D grids.\nUnlike the sampling operation commonly used in existing works, the dense 2D\nrepresentation 1) preserves most points in the given scene, 2) brings in a\nsignificant boost of efficiency, and 3) eliminates the density gap between\npoints and pixels, allowing us to perform effective feature fusion. We also\npresent a novel warping projection technique to alleviate the information loss\nproblem resulting from the fact that multiple points could be mapped into one\ngrid during projection when computing cost volume. Sufficient experiments\ndemonstrate the efficiency and effectiveness of our method, outperforming the\nprior-arts on the FlyingThings3D and KITTI datasets.\n","authors":["Chensheng Peng","Guangming Wang","Xian Wan Lo","Xinrui Wu","Chenfeng Xu","Masayoshi Tomizuka","Wei Zhan","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04383v1.pdf","comment":"Accepted by ICCV2023. Codes will be released at\n https://github.com/IRMVLab/DELFlow"},{"id":"http://arxiv.org/abs/2308.04380v1","updated":"2023-08-08T16:31:43Z","published":"2023-08-08T16:31:43Z","title":"Your Negative May not Be True Negative: Boosting Image-Text Matching\n with False Negative Elimination","summary":" Most existing image-text matching methods adopt triplet loss as the\noptimization objective, and choosing a proper negative sample for the triplet\nof (anchor, positive, negative) is important for effectively training the\nmodel, e.g., hard negatives make the model learn efficiently and effectively.\nHowever, we observe that existing methods mainly employ the most similar\nsamples as hard negatives, which may not be true negatives. In other words, the\nsamples with high similarity but not paired with the anchor may reserve\npositive semantic associations, and we call them false negatives. Repelling\nthese false negatives in triplet loss would mislead the semantic representation\nlearning and result in inferior retrieval performance. In this paper, we\npropose a novel False Negative Elimination (FNE) strategy to select negatives\nvia sampling, which could alleviate the problem introduced by false negatives.\nSpecifically, we first construct the distributions of positive and negative\nsamples separately via their similarities with the anchor, based on the\nfeatures extracted from image and text encoders. Then we calculate the false\nnegative probability of a given sample based on its similarity with the anchor\nand the above distributions via the Bayes' rule, which is employed as the\nsampling weight during the negative sampling process. Since there may not exist any\nfalse negative in a small batch size, we design a memory module with momentum\nto retain a large negative buffer and implement our negative sampling strategy\nspanning over the buffer. In addition, to make the model focus on hard\nnegatives, we reassign the sampling weights for the simple negatives with a\ncut-down strategy. The extensive experiments are conducted on Flickr30K and\nMS-COCO, and the results demonstrate the superiority of our proposed false\nnegative elimination strategy. 
The code is available at\nhttps://github.com/LuminosityX/FNE.\n","authors":["Haoxuan Li","Yi Bin","Junrong Liao","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04380v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.04373v1","updated":"2023-08-08T16:22:44Z","published":"2023-08-08T16:22:44Z","title":"Pelta: Shielding Transformers to Mitigate Evasion Attacks in Federated\n Learning","summary":" The main premise of federated learning is that machine learning model updates\nare computed locally, in particular to preserve user data privacy, as those\nnever leave the perimeter of their device. This mechanism supposes the general\nmodel, once aggregated, to be broadcast to collaborating and non malicious\nnodes. However, without proper defenses, compromised clients can easily probe\nthe model inside their local memory in search of adversarial examples. For\ninstance, considering image-based applications, adversarial examples consist of\nimperceptibly perturbed images (to the human eye) misclassified by the local\nmodel, which can be later presented to a victim node's counterpart model to\nreplicate the attack. To mitigate such malicious probing, we introduce Pelta, a\nnovel shielding mechanism leveraging trusted hardware. By harnessing the\ncapabilities of Trusted Execution Environments (TEEs), Pelta masks part of the\nback-propagation chain rule, otherwise typically exploited by attackers for the\ndesign of malicious samples. We evaluate Pelta on a state of the art ensemble\nmodel and demonstrate its effectiveness against the Self Attention Gradient\nadversarial Attack.\n","authors":["Simon Queyrut","Yérom-David Bromberg","Valerio Schiavoni"],"pdf_url":"https://arxiv.org/pdf/2308.04373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04370v1","updated":"2023-08-08T16:17:46Z","published":"2023-08-08T16:17:46Z","title":"When Super-Resolution Meets Camouflaged Object Detection: A Comparison\n Study","summary":" Super Resolution (SR) and Camouflaged Object Detection (COD) are two hot\ntopics in computer vision with various joint applications. For instance,\nlow-resolution surveillance images can be successively processed by\nsuper-resolution techniques and camouflaged object detection. However, in\nprevious work, these two areas are always studied in isolation. In this paper,\nwe, for the first time, conduct an integrated comparative evaluation for both.\nSpecifically, we benchmark different super-resolution methods on commonly used\nCOD datasets, and meanwhile, we evaluate the robustness of different COD models\nby using COD data processed by SR methods. Our goal is to bridge these two\ndomains, discover novel experimental phenomena, summarize new experim.\n","authors":["Juan Wen","Shupeng Cheng","Peng Xu","Bowen Zhou","Radu Timofte","Weiyan Hou","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2308.04370v1.pdf","comment":"23 pages with 8 figures"},{"id":"http://arxiv.org/abs/2308.04369v1","updated":"2023-08-08T16:15:35Z","published":"2023-08-08T16:15:35Z","title":"SSTFormer: Bridging Spiking Neural Network and Memory Support\n Transformer for Frame-Event based Recognition","summary":" Event camera-based pattern recognition is a newly arising research topic in\nrecent years. Current researchers usually transform the event streams into\nimages, graphs, or voxels, and adopt deep neural networks for event-based\nclassification. 
Although good performance can be achieved on simple event\nrecognition datasets, their results may still be limited due to the\nfollowing two issues. Firstly, they adopt spatial sparse event streams for\nrecognition only, which may fail to capture the color and detailed texture\ninformation well. Secondly, they adopt either Spiking Neural Networks (SNN) for\nenergy-efficient recognition with suboptimal results, or Artificial Neural\nNetworks (ANN) for energy-intensive, high-performance recognition. However,\nfew of them consider achieving a balance between these two aspects. In this\npaper, we formally propose to recognize patterns by fusing RGB frames and event\nstreams simultaneously and propose a new RGB frame-event recognition framework\nto address the aforementioned issues. The proposed method contains four main\nmodules, i.e., memory support Transformer network for RGB frame encoding,\nspiking neural network for raw event stream encoding, multi-modal bottleneck\nfusion module for RGB-Event feature aggregation, and prediction head. Due to\nthe scarcity of RGB-Event based classification datasets, we also propose a\nlarge-scale PokerEvent dataset which contains 114 classes, and 27102\nframe-event pairs recorded using a DVS346 event camera. Extensive experiments\non two RGB-Event based classification datasets fully validated the\neffectiveness of our proposed framework. We hope this work will boost the\ndevelopment of pattern recognition by fusing RGB frames and event streams. Both\nour dataset and source code of this work will be released at\nhttps://github.com/Event-AHU/SSTFormer.\n","authors":["Xiao Wang","Zongzhen Wu","Yao Rong","Lin Zhu","Bo Jiang","Jin Tang","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.04369v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2303.09040v2","updated":"2023-08-08T16:14:32Z","published":"2023-03-16T02:24:31Z","title":"Hybrid Spectral Denoising Transformer with Guided Attention","summary":" In this paper, we present a Hybrid Spectral Denoising Transformer (HSDT) for\nhyperspectral image denoising. Challenges in adapting transformer for HSI arise\nfrom the capabilities to tackle existing limitations of CNN-based methods in\ncapturing the global and local spatial-spectral correlations while maintaining\nefficiency and flexibility. To address these issues, we introduce a hybrid\napproach that combines the advantages of both models with a Spatial-Spectral\nSeparable Convolution (S3Conv), Guided Spectral Self-Attention (GSSA), and\nSelf-Modulated Feed-Forward Network (SM-FFN). Our S3Conv works as a lightweight\nalternative to 3D convolution, which extracts more spatial-spectral correlated\nfeatures while keeping the flexibility to tackle HSIs with an arbitrary number\nof bands. These features are then adaptively processed by GSSA which performs\n3D self-attention across the spectral bands, guided by a set of learnable\nqueries that encode the spectral signatures. This not only enriches our model\nwith powerful capabilities for identifying global spectral correlations but\nalso maintains linear complexity. Moreover, our SM-FFN proposes the\nself-modulation that intensifies the activations of more informative regions,\nwhich further strengthens the aggregated features. Extensive experiments are\nconducted on various datasets under both simulated and real-world noise, and it\nshows that our HSDT significantly outperforms the existing state-of-the-art\nmethods while maintaining low computational overhead. 
Code is at https:\n//github.com/Zeqiang-Lai/HSDT.\n","authors":["Zeqiang Lai","Chenggang Yan","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2303.09040v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.07916v3","updated":"2023-08-08T16:06:11Z","published":"2023-04-16T23:37:24Z","title":"GaitRef: Gait Recognition with Refined Sequential Skeletons","summary":" Identifying humans with their walking sequences, known as gait recognition,\nis a useful biometric understanding task as it can be observed from a long\ndistance and does not require cooperation from the subject. Two common\nmodalities used for representing the walking sequence of a person are\nsilhouettes and joint skeletons. Silhouette sequences, which record the\nboundary of the walking person in each frame, may suffer from the variant\nappearances from carried-on objects and clothes of the person. Framewise joint\ndetections are noisy and introduce some jitters that are not consistent with\nsequential detections. In this paper, we combine the silhouettes and skeletons\nand refine the framewise joint predictions for gait recognition. With temporal\ninformation from the silhouette sequences, we show that the refined skeletons\ncan improve gait recognition performance without extra annotations. We compare\nour methods on four public datasets, CASIA-B, OUMVLP, Gait3D and GREW, and show\nstate-of-the-art performance.\n","authors":["Haidong Zhu","Wanrong Zheng","Zhaoheng Zheng","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2304.07916v3.pdf","comment":"IJCB 2023 oral. Code is available at\n https://github.com/haidongz-usc/GaitRef"},{"id":"http://arxiv.org/abs/2303.16565v2","updated":"2023-08-08T16:01:41Z","published":"2023-03-29T09:47:48Z","title":"PMAA: A Progressive Multi-scale Attention Autoencoder Model for\n High-performance Cloud Removal from Multi-temporal Satellite Imagery","summary":" Satellite imagery analysis plays a pivotal role in remote sensing; however,\ninformation loss due to cloud cover significantly impedes its application.\nAlthough existing deep cloud removal models have achieved notable outcomes,\nthey scarcely consider contextual information. This study introduces a\nhigh-performance cloud removal architecture, termed Progressive Multi-scale\nAttention Autoencoder (PMAA), which concurrently harnesses global and local\ninformation to construct robust contextual dependencies using a novel\nMulti-scale Attention Module (MAM) and a novel Local Interaction Module (LIM).\nPMAA establishes long-range dependencies of multi-scale features using MAM and\nmodulates the reconstruction of fine-grained details utilizing LIM, enabling\nsimultaneous representation of fine- and coarse-grained features at the same\nlevel. With the help of diverse and multi-scale features, PMAA consistently\noutperforms the previous state-of-the-art model CTGAN on two benchmark\ndatasets. Moreover, PMAA boasts considerable efficiency advantages, with only\n0.5% and 14.6% of the parameters and computational complexity of CTGAN,\nrespectively. These comprehensive results underscore PMAA's potential as a\nlightweight cloud removal network suitable for deployment on edge devices to\naccomplish large-scale cloud removal tasks. 
Our source code and pre-trained\nmodels are available at https://github.com/XavierJiezou/PMAA.\n","authors":["Xuechao Zou","Kai Li","Junliang Xing","Pin Tao","Yachao Cui"],"pdf_url":"https://arxiv.org/pdf/2303.16565v2.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2308.04356v1","updated":"2023-08-08T16:01:11Z","published":"2023-08-08T16:01:11Z","title":"Learning Unbiased Image Segmentation: A Case Study with Plain Knee\n Radiographs","summary":" Automatic segmentation of knee bony anatomy is essential in orthopedics, and\nit has been around for several years in both pre-operative and post-operative\nsettings. While deep learning algorithms have demonstrated exceptional\nperformance in medical image analysis, the assessment of fairness and potential\nbiases within these models remains limited. This study aims to revisit deep\nlearning-powered knee-bony anatomy segmentation using plain radiographs to\nuncover visible gender and racial biases. The current contribution offers the\npotential to advance our understanding of biases, and it provides practical\ninsights for researchers and practitioners in medical imaging. The proposed\nmitigation strategies mitigate gender and racial biases, ensuring fair and\nunbiased segmentation results. Furthermore, this work promotes equal access to\naccurate diagnoses and treatment outcomes for diverse patient populations,\nfostering equitable and inclusive healthcare provision.\n","authors":["Nickolas Littlefield","Johannes F. Plate","Kurt R. Weiss","Ines Lohse","Avani Chhabra","Ismaeel A. Siddiqui","Zoe Menezes","George Mastorakos","Sakshi Mehul Thakar","Mehrnaz Abedian","Matthew F. Gong","Luke A. Carlson","Hamidreza Moradi","Soheyla Amirian","Ahmad P. Tafti"],"pdf_url":"https://arxiv.org/pdf/2308.04356v1.pdf","comment":"This paper has been accepted by IEEE BHI 2023"},{"id":"http://arxiv.org/abs/2308.04352v1","updated":"2023-08-08T15:59:17Z","published":"2023-08-08T15:59:17Z","title":"3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment","summary":" 3D vision-language grounding (3D-VL) is an emerging field that aims to\nconnect the 3D physical world with natural language, which is crucial for\nachieving embodied intelligence. Current 3D-VL models rely heavily on\nsophisticated modules, auxiliary losses, and optimization tricks, which calls\nfor a simple and unified model. In this paper, we propose 3D-VisTA, a\npre-trained Transformer for 3D Vision and Text Alignment that can be easily\nadapted to various downstream tasks. 3D-VisTA simply utilizes self-attention\nlayers for both single-modal modeling and multi-modal fusion without any\nsophisticated task-specific design. To further enhance its performance on 3D-VL\ntasks, we construct ScanScribe, the first large-scale 3D scene-text pairs\ndataset for 3D-VL pre-training. ScanScribe contains 2,995 RGB-D scans for 1,185\nunique indoor scenes originating from ScanNet and 3R-Scan datasets, along with\npaired 278K scene descriptions generated from existing 3D-VL tasks, templates,\nand GPT-3. 3D-VisTA is pre-trained on ScanScribe via masked language/object\nmodeling and scene-text matching. It achieves state-of-the-art results on\nvarious 3D-VL tasks, ranging from visual grounding and dense captioning to\nquestion answering and situated reasoning. 
Moreover, 3D-VisTA demonstrates\nsuperior data efficiency, obtaining strong performance even with limited\nannotations during downstream task fine-tuning.\n","authors":["Ziyu Zhu","Xiaojian Ma","Yixin Chen","Zhidong Deng","Siyuan Huang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2308.04352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16181v3","updated":"2023-08-08T15:50:35Z","published":"2023-06-28T13:03:43Z","title":"Learning to Pan-sharpening with Memories of Spatial Details","summary":" Pan-sharpening, as one of the most commonly used techniques in remote sensing\nsystems, aims to inject spatial details from panchromatic images into\nmultispectral images (MS) to obtain high-resolution multispectral images. Since\ndeep learning has received widespread attention because of its powerful fitting\nability and efficient feature extraction, a variety of pan-sharpening methods\nhave been proposed to achieve remarkable performance. However, current\npan-sharpening methods usually require the paired panchromatic (PAN) and MS\nimages as input, which limits their usage in some scenarios. To address this\nissue, in this paper we observe that the spatial details from PAN images are\nmainly high-frequency cues, i.e., the edges reflect the contour of input PAN\nimages. This motivates us to develop a PAN-agnostic representation to store\nsome base edges, so as to compose the contour for the corresponding PAN image\nvia them. As a result, we can perform the pan-sharpening task with only the MS\nimage when inference. To this end, a memory-based network is adapted to extract\nand memorize the spatial details during the training phase and is used to\nreplace the process of obtaining spatial information from PAN images when\ninference, which is called Memory-based Spatial Details Network (MSDN).\nFinally, we integrate the proposed MSDN module into the existing deep\nlearning-based pan-sharpening methods to achieve an end-to-end pan-sharpening\nnetwork. With extensive experiments on the Gaofen1 and WorldView-4 satellites,\nwe verify that our method constructs good spatial details without PAN images\nand achieves the best performance. The code is available at\nhttps://github.com/Zhao-Tian-yi/Learning-to-Pan-sharpening-with-Memories-of-Spatial-Details.git.\n","authors":["Maoxun Yuan","Tianyi Zhao","Bo Li","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2306.16181v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04517v2","updated":"2023-08-08T15:50:11Z","published":"2023-05-08T07:22:37Z","title":"DiffBFR: Bootstrapping Diffusion Model Towards Blind Face Restoration","summary":" Blind face restoration (BFR) is important while challenging. Prior works\nprefer to exploit GAN-based frameworks to tackle this task due to the balance\nof quality and efficiency. However, these methods suffer from poor stability\nand adaptability to long-tail distribution, failing to simultaneously retain\nsource identity and restore detail. We propose DiffBFR to introduce Diffusion\nProbabilistic Model (DPM) for BFR to tackle the above problem, given its\nsuperiority over GAN in aspects of avoiding training collapse and generating\nlong-tail distribution. DiffBFR utilizes a two-step design, that first restores\nidentity information from low-quality images and then enhances texture details\naccording to the distribution of real faces. This design is implemented with\ntwo key components: 1) Identity Restoration Module (IRM) for preserving the\nface details in results. 
Instead of denoising from pure Gaussian random\ndistribution with LQ images as the condition during the reverse process, we\npropose a novel truncated sampling method which starts from LQ images with part\nnoise added. We theoretically prove that this change shrinks the evidence lower\nbound of DPM and then restores more original details. With theoretical proof,\ntwo cascade conditional DPMs with different input sizes are introduced to\nstrengthen this sampling effect and reduce training difficulty in the\nhigh-resolution image generated directly. 2) Texture Enhancement Module (TEM)\nfor polishing the texture of the image. Here an unconditional DPM, a LQ-free\nmodel, is introduced to further force the restorations to appear realistic. We\ntheoretically proved that this unconditional DPM trained on pure HQ images\ncontributes to justifying the correct distribution of inference images output\nfrom IRM in pixel-level space. Truncated sampling with fractional time step is\nutilized to polish pixel-level textures while preserving identity information.\n","authors":["Xinmin Qiu","Congying Han","Zicheng Zhang","Bonan Li","Tiande Guo","Xuecheng Nie"],"pdf_url":"https://arxiv.org/pdf/2305.04517v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04343v1","updated":"2023-08-08T15:43:59Z","published":"2023-08-08T15:43:59Z","title":"Unifying Two-Stream Encoders with Transformers for Cross-Modal Retrieval","summary":" Most existing cross-modal retrieval methods employ two-stream encoders with\ndifferent architectures for images and texts, \\textit{e.g.}, CNN for images and\nRNN/Transformer for texts. Such discrepancy in architectures may induce\ndifferent semantic distribution spaces and limit the interactions between\nimages and texts, and further result in inferior alignment between images and\ntexts. To fill this research gap, inspired by recent advances of Transformers\nin vision tasks, we propose to unify the encoder architectures with\nTransformers for both modalities. Specifically, we design a cross-modal\nretrieval framework purely based on two-stream Transformers, dubbed\n\\textbf{Hierarchical Alignment Transformers (HAT)}, which consists of an image\nTransformer, a text Transformer, and a hierarchical alignment module. With such\nidentical architectures, the encoders could produce representations with more\nsimilar characteristics for images and texts, and make the interactions and\nalignments between them much easier. Besides, to leverage the rich semantics,\nwe devise a hierarchical alignment scheme to explore multi-level\ncorrespondences of different layers between images and texts. To evaluate the\neffectiveness of the proposed HAT, we conduct extensive experiments on two\nbenchmark datasets, MSCOCO and Flickr30K. Experimental results demonstrate that\nHAT outperforms SOTA baselines by a large margin. Specifically, on two key\ntasks, \\textit{i.e.}, image-to-text and text-to-image retrieval, HAT achieves\n7.6\\% and 16.7\\% relative score improvement of Recall@1 on MSCOCO, and 4.4\\%\nand 11.6\\% on Flickr30k respectively. 
The code is available at\n\\url{https://github.com/LuminosityX/HAT}.\n","authors":["Yi Bin","Haoxuan Li","Yahui Xu","Xing Xu","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04343v1.pdf","comment":"Accepted at ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.04340v1","updated":"2023-08-08T15:36:57Z","published":"2023-08-08T15:36:57Z","title":"A Lightweight and Accurate Face Detection Algorithm Based on Retinaface","summary":" In this paper, we propose a lightweight and accurate face detection algorithm\nLAFD (Light and accurate face detection) based on Retinaface. Backbone network\nin the algorithm is a modified MobileNetV3 network which adjusts the size of\nthe convolution kernel, the channel expansion multiplier of the inverted\nresiduals block and the use of the SE attention mechanism. Deformable\nconvolution network(DCN) is introduced in the context module and the algorithm\nuses focal loss function instead of cross-entropy loss function as the\nclassification loss function of the model. The test results on the WIDERFACE\ndataset indicate that the average accuracy of LAFD is 94.1%, 92.2% and 82.1%\nfor the \"easy\", \"medium\" and \"hard\" validation subsets respectively with an\nimprovement of 3.4%, 4.0% and 8.3% compared to Retinaface and 3.1%, 4.1% and\n4.1% higher than the well-performing lightweight model, LFFD. If the input\nimage is pre-processed and scaled to 1560px in length or 1200px in width, the\nmodel achieves an average accuracy of 86.2% on the 'hard' validation subset.\nThe model is lightweight, with a size of only 10.2MB.\n","authors":["Baozhu Liu","Hewei Yu"],"pdf_url":"https://arxiv.org/pdf/2308.04340v1.pdf","comment":"14 pages, 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2308.04337v1","updated":"2023-08-08T15:30:08Z","published":"2023-08-08T15:30:08Z","title":"Pengembangan Model untuk Mendeteksi Kerusakan pada Terumbu Karang dengan\n Klasifikasi Citra","summary":" The abundant biodiversity of coral reefs in Indonesian waters is a valuable\nasset that needs to be preserved. Rapid climate change and uncontrolled human\nactivities have led to the degradation of coral reef ecosystems, including\ncoral bleaching, which is a critical indicator of coral health conditions.\nTherefore, this research aims to develop an accurate classification model to\ndistinguish between healthy corals and corals experiencing bleaching. This\nstudy utilizes a specialized dataset consisting of 923 images collected from\nFlickr using the Flickr API. The dataset comprises two distinct classes:\nhealthy corals (438 images) and bleached corals (485 images). These images have\nbeen resized to a maximum of 300 pixels in width or height, whichever is\nlarger, to maintain consistent sizes across the dataset.\n The method employed in this research involves the use of machine learning\nmodels, particularly convolutional neural networks (CNN), to recognize and\ndifferentiate visual patterns associated with healthy and bleached corals. In\nthis context, the dataset can be used to train and test various classification\nmodels to achieve optimal results. By leveraging the ResNet model, it was found\nthat a from-scratch ResNet model can outperform pretrained models in terms of\nprecision and accuracy. The success in developing accurate classification\nmodels will greatly benefit researchers and marine biologists in gaining a\nbetter understanding of coral reef health. 
These models can also be employed to\nmonitor changes in the coral reef environment, thereby making a significant\ncontribution to conservation and ecosystem restoration efforts that have\nfar-reaching impacts on life.\n","authors":["Fadhil Muhammad","Alif Bintang Elfandra","Iqbal Pahlevi Amin","Alfan Farizki Wicaksono"],"pdf_url":"https://arxiv.org/pdf/2308.04337v1.pdf","comment":"in Indonesian language"},{"id":"http://arxiv.org/abs/2305.12522v2","updated":"2023-08-08T15:22:26Z","published":"2023-05-21T17:46:28Z","title":"P-NOC: Adversarial CAM Generation for Weakly Supervised Semantic\n Segmentation","summary":" To mitigate the necessity for large amounts of supervised segmentation\nannotation sets, multiple Weakly Supervised Semantic Segmentation (WSSS)\nstrategies have been devised. These will often rely on advanced data and model\nregularization strategies to instigate the development of useful properties\n(e.g., prediction completeness and fidelity to semantic boundaries) in\nsegmentation priors, notwithstanding the lack of annotated information. In this\nwork, we first create a strong baseline by analyzing complementary WSSS\ntechniques and regularizing strategies, considering their strengths and\nlimitations. We then propose a new Class-specific Adversarial Erasing strategy,\ncomprising two adversarial CAM generating networks being gradually refined to\nproduce robust semantic segmentation proposals. Empirical results suggest that\nour approach induces substantial improvement in the effectiveness of the\nbaseline, resulting in a noticeable improvement over both Pascal VOC 2012 and\nMS COCO 2014 datasets.\n","authors":["Lucas David","Helio Pedrini","Zanoni Dias"],"pdf_url":"https://arxiv.org/pdf/2305.12522v2.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.04322v1","updated":"2023-08-08T15:15:51Z","published":"2023-08-08T15:15:51Z","title":"Domain Adaptive Person Search via GAN-based Scene Synthesis for\n Cross-scene Videos","summary":" Person search has recently been a challenging task in the computer vision\ndomain, which aims to search specific pedestrians from real\ncameras.Nevertheless, most surveillance videos comprise only a handful of\nimages of each pedestrian, which often feature identical backgrounds and\nclothing. Hence, it is difficult to learn more discriminative features for\nperson search in real scenes. To tackle this challenge, we draw on Generative\nAdversarial Networks (GAN) to synthesize data from surveillance videos. GAN has\nthrived in computer vision problems because it produces high-quality images\nefficiently. We merely alter the popular Fast R-CNN model, which is capable of\nprocessing videos and yielding accurate detection outcomes. In order to\nappropriately relieve the pressure brought by the two-stage model, we design an\nAssisted-Identity Query Module (AIDQ) to provide positive images for the behind\npart. Besides, the proposed novel GAN-based Scene Synthesis model that can\nsynthesize high-quality cross-id person images for person search tasks. In\norder to facilitate the feature learning of the GAN-based Scene Synthesis\nmodel, we adopt an online learning strategy that collaboratively learns the\nsynthesized images and original images. 
Extensive experiments on two widely\nused person search benchmarks, CUHK-SYSU and PRW, have shown that our method\nhas achieved great performance, and the extensive ablation study further\njustifies our GAN-synthetic data can effectively increase the variability of\nthe datasets and be more realistic.\n","authors":["Huibing Wang","Tianxiang Cui","Mingze Yao","Huijuan Pang","Yushan Du"],"pdf_url":"https://arxiv.org/pdf/2308.04322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04321v1","updated":"2023-08-08T15:14:23Z","published":"2023-08-08T15:14:23Z","title":"All-pairs Consistency Learning for Weakly Supervised Semantic\n Segmentation","summary":" In this work, we propose a new transformer-based regularization to better\nlocalize objects for Weakly supervised semantic segmentation (WSSS). In\nimage-level WSSS, Class Activation Map (CAM) is adopted to generate object\nlocalization as pseudo segmentation labels. To address the partial activation\nissue of the CAMs, consistency regularization is employed to maintain\nactivation intensity invariance across various image augmentations. However,\nsuch methods ignore pair-wise relations among regions within each CAM, which\ncapture context and should also be invariant across image views. To this end,\nwe propose a new all-pairs consistency regularization (ACR). Given a pair of\naugmented views, our approach regularizes the activation intensities between a\npair of augmented views, while also ensuring that the affinity across regions\nwithin each view remains consistent. We adopt vision transformers as the\nself-attention mechanism naturally embeds pair-wise affinity. This enables us\nto simply regularize the distance between the attention matrices of augmented\nimage pairs. Additionally, we introduce a novel class-wise localization method\nthat leverages the gradients of the class token. Our method can be seamlessly\nintegrated into existing WSSS methods using transformers without modifying the\narchitectures. We evaluate our method on PASCAL VOC and MS COCO datasets. Our\nmethod produces noticeably better class localization maps (67.3% mIoU on PASCAL\nVOC train), resulting in superior WSSS performances.\n","authors":["Weixuan Sun","Yanhao Zhang","Zhen Qin","Zheyuan Liu","Lin Cheng","Fanyi Wang","Yiran Zhong","Nick Barnes"],"pdf_url":"https://arxiv.org/pdf/2308.04321v1.pdf","comment":"ICCV 2023 workshop"},{"id":"http://arxiv.org/abs/2307.07873v3","updated":"2023-08-08T15:13:22Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding Adversarial\n Transferability From Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. 
Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v3.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2308.02781v2","updated":"2023-08-08T14:54:36Z","published":"2023-08-05T03:21:12Z","title":"A Voting-Stacking Ensemble of Inception Networks for Cervical Cytology\n Classification","summary":" Cervical cancer is one of the most severe diseases threatening women's\nhealth. Early detection and diagnosis can significantly reduce cancer risk, in\nwhich cervical cytology classification is indispensable. Researchers have\nrecently designed many networks for automated cervical cancer diagnosis, but\nthe limited accuracy and bulky size of these individual models cannot meet\npractical application needs. To address this issue, we propose a\nVoting-Stacking ensemble strategy, which employs three Inception networks as\nbase learners and integrates their outputs through a voting ensemble. The\nsamples misclassified by the ensemble model generate a new training set on\nwhich a linear classification model is trained as the meta-learner and performs\nthe final predictions. In addition, a multi-level Stacking ensemble framework\nis designed to improve performance further. The method is evaluated on the\nSIPakMed, Herlev, and Mendeley datasets, achieving accuracies of 100%, 100%,\nand 100%, respectively. The experimental results outperform the current\nstate-of-the-art (SOTA) methods, demonstrating its potential for reducing\nscreening workload and helping pathologists detect cervical cancer.\n","authors":["Linyi Qian","Qian Huang","Yulin Chen","Junzhou Chen"],"pdf_url":"https://arxiv.org/pdf/2308.02781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12344v2","updated":"2023-08-08T14:52:39Z","published":"2023-07-23T14:43:17Z","title":"Right for the Wrong Reason: Can Interpretable ML Techniques Detect\n Spurious Correlations?","summary":" While deep neural network models offer unmatched classification performance,\nthey are prone to learning spurious correlations in the data. Such dependencies\non confounding information can be difficult to detect using performance metrics\nif the test data comes from the same distribution as the training data.\nInterpretable ML methods such as post-hoc explanations or inherently\ninterpretable classifiers promise to identify faulty model reasoning. 
However,\nthere is mixed evidence whether many of these techniques are actually able to\ndo so. In this paper, we propose a rigorous evaluation strategy to assess an\nexplanation technique's ability to correctly identify spurious correlations.\nUsing this strategy, we evaluate five post-hoc explanation techniques and one\ninherently interpretable method for their ability to detect three types of\nartificially added confounders in a chest x-ray diagnosis task. We find that\nthe post-hoc technique SHAP, as well as the inherently interpretable Attri-Net\nprovide the best performance and can be used to reliably identify faulty model\nbehavior.\n","authors":["Susu Sun","Lisa M. Koch","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2307.12344v2.pdf","comment":"Accepted to MICCAI 2023"},{"id":"http://arxiv.org/abs/2303.00500v2","updated":"2023-08-08T14:50:50Z","published":"2023-03-01T13:32:55Z","title":"Inherently Interpretable Multi-Label Classification Using Class-Specific\n Counterfactuals","summary":" Interpretability is essential for machine learning algorithms in high-stakes\napplication fields such as medical image analysis. However, high-performing\nblack-box neural networks do not provide explanations for their predictions,\nwhich can lead to mistrust and suboptimal human-ML collaboration. Post-hoc\nexplanation techniques, which are widely used in practice, have been shown to\nsuffer from severe conceptual problems. Furthermore, as we show in this paper,\ncurrent explanation techniques do not perform adequately in the multi-label\nscenario, in which multiple medical findings may co-occur in a single image. We\npropose Attri-Net, an inherently interpretable model for multi-label\nclassification. Attri-Net is a powerful classifier that provides transparent,\ntrustworthy, and human-understandable explanations. The model first generates\nclass-specific attribution maps based on counterfactuals to identify which\nimage regions correspond to certain medical findings. Then a simple logistic\nregression classifier is used to make predictions based solely on these\nattribution maps. We compare Attri-Net to five post-hoc explanation techniques\nand one inherently interpretable classifier on three chest X-ray datasets. We\nfind that Attri-Net produces high-quality multi-label explanations consistent\nwith clinical knowledge and has comparable classification performance to\nstate-of-the-art classification models.\n","authors":["Susu Sun","Stefano Woerner","Andreas Maier","Lisa M. Koch","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2303.00500v2.pdf","comment":"Accepted to MIDL 2023"},{"id":"http://arxiv.org/abs/2308.04303v1","updated":"2023-08-08T14:49:44Z","published":"2023-08-08T14:49:44Z","title":"Vehicle Motion Forecasting using Prior Information and Semantic-assisted\n Occupancy Grid Maps","summary":" Motion prediction is a challenging task for autonomous vehicles due to\nuncertainty in the sensor data, the non-deterministic nature of future, and\ncomplex behavior of agents. In this paper, we tackle this problem by\nrepresenting the scene as dynamic occupancy grid maps (DOGMs), associating\nsemantic labels to the occupied cells and incorporating map information. We\npropose a novel framework that combines deep-learning-based spatio-temporal and\nprobabilistic approaches to predict vehicle behaviors.Contrary to the\nconventional OGM prediction methods, evaluation of our work is conducted\nagainst the ground truth annotations. 
We experiment and validate our results on\nreal-world NuScenes dataset and show that our model shows superior ability to\npredict both static and dynamic vehicles compared to OGM predictions.\nFurthermore, we perform an ablation study and assess the role of semantic\nlabels and map in the architecture.\n","authors":["Rabbia Asghar","Manuel Diaz-Zapata","Lukas Rummelhard","Anne Spalanzani","Christian Laugier"],"pdf_url":"https://arxiv.org/pdf/2308.04303v1.pdf","comment":"Accepted to the 2023 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2023)"},{"id":"http://arxiv.org/abs/2308.04288v1","updated":"2023-08-08T14:32:38Z","published":"2023-08-08T14:32:38Z","title":"Cloth2Tex: A Customized Cloth Texture Generation Pipeline for 3D Virtual\n Try-On","summary":" Fabricating and designing 3D garments has become extremely demanding with the\nincreasing need for synthesizing realistic dressed persons for a variety of\napplications, e.g. 3D virtual try-on, digitalization of 2D clothes into 3D\napparel, and cloth animation. It thus necessitates a simple and straightforward\npipeline to obtain high-quality texture from simple input, such as 2D reference\nimages. Since traditional warping-based texture generation methods require a\nsignificant number of control points to be manually selected for each type of\ngarment, which can be a time-consuming and tedious process. We propose a novel\nmethod, called Cloth2Tex, which eliminates the human burden in this process.\nCloth2Tex is a self-supervised method that generates texture maps with\nreasonable layout and structural consistency. Another key feature of Cloth2Tex\nis that it can be used to support high-fidelity texture inpainting. This is\ndone by combining Cloth2Tex with a prevailing latent diffusion model. We\nevaluate our approach both qualitatively and quantitatively and demonstrate\nthat Cloth2Tex can generate high-quality texture maps and achieve the best\nvisual effects in comparison to other methods. Project page:\ntomguluson92.github.io/projects/cloth2tex/\n","authors":["Daiheng Gao","Xu Chen","Xindi Zhang","Qi Wang","Ke Sun","Bang Zhang","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2308.04288v1.pdf","comment":"15 pages, 15 figures"},{"id":"http://arxiv.org/abs/2212.04780v3","updated":"2023-08-08T14:30:05Z","published":"2022-12-09T11:18:40Z","title":"Genie: Show Me the Data for Quantization","summary":" Zero-shot quantization is a promising approach for developing lightweight\ndeep neural networks when data is inaccessible owing to various reasons,\nincluding cost and issues related to privacy. By exploiting the learned\nparameters ($\\mu$ and $\\sigma$) of batch normalization layers in an\nFP32-pre-trained model, zero-shot quantization schemes focus on generating\nsynthetic data. Subsequently, they distill knowledge from the pre-trained model\n(teacher) to the quantized model (student) such that the quantized model can be\noptimized with the synthetic dataset. However, thus far, zero-shot quantization\nhas primarily been discussed in the context of quantization-aware training\nmethods, which require task-specific losses and long-term optimization as much\nas retraining. We thus introduce a post-training quantization scheme for\nzero-shot quantization that produces high-quality quantized networks within a\nfew hours. Furthermore, we propose a framework called Genie~that generates data\nsuited for quantization. 
With the data synthesized by Genie, we can produce\nrobust quantized models without real datasets, which is comparable to few-shot\nquantization. We also propose a post-training quantization algorithm to enhance\nthe performance of quantized models. By combining them, we can bridge the gap\nbetween zero-shot and few-shot quantization while significantly improving the\nquantization performance compared to that of existing approaches. In other\nwords, we can obtain a unique state-of-the-art zero-shot quantization approach.\nThe code is available at \\url{https://github.com/SamsungLabs/Genie}.\n","authors":["Yongkweon Jeon","Chungman Lee","Ho-young Kim"],"pdf_url":"https://arxiv.org/pdf/2212.04780v3.pdf","comment":"Accepted by CVPR 2023, https://github.com/SamsungLabs/Genie"},{"id":"http://arxiv.org/abs/2308.04283v1","updated":"2023-08-08T14:25:13Z","published":"2023-08-08T14:25:13Z","title":"Vision-Based Autonomous Navigation for Unmanned Surface Vessel in\n Extreme Marine Conditions","summary":" Visual perception is an important component for autonomous navigation of\nunmanned surface vessels (USV), particularly for the tasks related to\nautonomous inspection and tracking. These tasks involve vision-based navigation\ntechniques to identify the target for navigation. Reduced visibility under\nextreme weather conditions in marine environments makes it difficult for\nvision-based approaches to work properly. To overcome these issues, this paper\npresents an autonomous vision-based navigation framework for tracking target\nobjects in extreme marine conditions. The proposed framework consists of an\nintegrated perception pipeline that uses a generative adversarial network (GAN)\nto remove noise and highlight the object features before passing them to the\nobject detector (i.e., YOLOv5). The detected visual features are then used by\nthe USV to track the target. The proposed framework has been thoroughly tested\nin simulation under extremely reduced visibility due to sandstorms and fog. The\nresults are compared with state-of-the-art de-hazing methods across the\nbenchmarked MBZIRC simulation dataset, on which the proposed scheme has\noutperformed the existing methods across various metrics.\n","authors":["Muhayyuddin Ahmed","Ahsan Baidar Bakht","Taimur Hassan","Waseem Akram","Ahmed Humais","Lakmal Seneviratne","Shaoming He","Defu Lin","Irfan Hussain"],"pdf_url":"https://arxiv.org/pdf/2308.04283v1.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots (IROS-2023)"},{"id":"http://arxiv.org/abs/2308.04269v1","updated":"2023-08-08T14:10:16Z","published":"2023-08-08T14:10:16Z","title":"Lossy and Lossless (L$^2$) Post-training Model Size Compression","summary":" Deep neural networks have delivered remarkable performance and have been\nwidely used in various visual tasks. However, their huge size causes\nsignificant inconvenience for transmission and storage. Many previous studies\nhave explored model size compression. However, these studies often approach\nvarious lossy and lossless compression methods in isolation, leading to\nchallenges in achieving high compression ratios efficiently. This work proposes\na post-training model size compression method that combines lossy and lossless\ncompression in a unified way. We first propose a unified parametric weight\ntransformation, which ensures different lossy compression methods can be\nperformed jointly in a post-training manner. 
Then, a dedicated differentiable\ncounter is introduced to guide the optimization of lossy compression to arrive\nat a more suitable point for later lossless compression. Additionally, our\nmethod can easily control a desired global compression ratio and allocate\nadaptive ratios for different layers. Finally, our method can achieve a stable\n$10\\times$ compression ratio without sacrificing accuracy and a $20\\times$\ncompression ratio with minor accuracy loss in a short time. Our code is\navailable at https://github.com/ModelTC/L2_Compression .\n","authors":["Yumeng Shi","Shihao Bai","Xiuying Wei","Ruihao Gong","Jianlei Yang"],"pdf_url":"https://arxiv.org/pdf/2308.04269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04262v1","updated":"2023-08-08T13:59:16Z","published":"2023-08-08T13:59:16Z","title":"SDLFormer: A Sparse and Dense Locality-enhanced Transformer for\n Accelerated MR Image Reconstruction","summary":" Transformers have emerged as viable alternatives to convolutional neural\nnetworks owing to their ability to learn non-local region relationships in the\nspatial domain. The self-attention mechanism of the transformer enables\ntransformers to capture long-range dependencies in the images, which might be\ndesirable for accelerated MRI image reconstruction as the effect of\nundersampling is non-local in the image domain. Despite its computational\nefficiency, the window-based transformers suffer from restricted receptive\nfields as the dependencies are limited to within the scope of the image\nwindows. We propose a window-based transformer network that integrates dilated\nattention mechanism and convolution for accelerated MRI image reconstruction.\nThe proposed network consists of dilated and dense neighborhood attention\ntransformers to enhance the distant neighborhood pixel relationship and\nintroduce depth-wise convolutions within the transformer module to learn\nlow-level translation invariant features for accelerated MRI image\nreconstruction. The proposed model is trained in a self-supervised manner. We\nperform extensive experiments for multi-coil MRI acceleration for coronal PD,\ncoronal PDFS and axial T2 contrasts with 4x and 5x under-sampling in\nself-supervised learning based on k-space splitting. We compare our method\nagainst other reconstruction architectures and the parallel domain\nself-supervised learning baseline. Results show that the proposed model\nexhibits improvement margins of (i) around 1.40 dB in PSNR and around 0.028 in\nSSIM on average over other architectures (ii) around 1.44 dB in PSNR and around\n0.029 in SSIM over parallel domain self-supervised learning. The code is\navailable at https://github.com/rahul-gs-16/sdlformer.git\n","authors":["Rahul G. S.","Sriprabha Ramnarayanan","Mohammad Al Fahim","Keerthi Ram","Preejith S. P","Mohanasankar Sivaprakasam"],"pdf_url":"https://arxiv.org/pdf/2308.04262v1.pdf","comment":"Accepted at MICCAI workshop MILLanD 2023 Medical Image Learning with\n noisy and Limited Data"},{"id":"http://arxiv.org/abs/2307.11661v2","updated":"2023-08-08T13:44:12Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. 
Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. The code, prompts, and auxiliary text dataset is\navailable at https://github.com/mayug/VDT-Adapter.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v2.pdf","comment":"Paper accepted at ICCV-W 2023. V2 contains additional comparisons\n with concurrent works"},{"id":"http://arxiv.org/abs/2308.04252v1","updated":"2023-08-08T13:38:50Z","published":"2023-08-08T13:38:50Z","title":"Blur aware metric depth estimation with multi-focus plenoptic cameras","summary":" While a traditional camera only captures one point of view of a scene, a\nplenoptic or light-field camera, is able to capture spatial and angular\ninformation in a single snapshot, enabling depth estimation from a single\nacquisition. In this paper, we present a new metric depth estimation algorithm\nusing only raw images from a multi-focus plenoptic camera. The proposed\napproach is especially suited for the multi-focus configuration where several\nmicro-lenses with different focal lengths are used. The main goal of our blur\naware depth estimation (BLADE) approach is to improve disparity estimation for\ndefocus stereo images by integrating both correspondence and defocus cues. We\nthus leverage blur information where it was previously considered a drawback.\nWe explicitly derive an inverse projection model including the defocus blur\nproviding depth estimates up to a scale factor. A method to calibrate the\ninverse model is then proposed. We thus take into account depth scaling to\nachieve precise and accurate metric depth estimates. Our results show that\nintroducing defocus cues improves the depth estimation. We demonstrate the\neffectiveness of our framework and depth scaling calibration on relative depth\nestimation setups and on real-world 3D complex scenes with ground truth\nacquired with a 3D lidar scanner.\n","authors":["Mathieu Labussière","Céline Teulière","Omar Ait-Aider"],"pdf_url":"https://arxiv.org/pdf/2308.04252v1.pdf","comment":"21 pages, 12 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2308.04249v1","updated":"2023-08-08T13:28:34Z","published":"2023-08-08T13:28:34Z","title":"MindDiffuser: Controlled Image Reconstruction from Human Brain Activity\n with Semantic and Structural Diffusion","summary":" Reconstructing visual stimuli from brain recordings has been a meaningful and\nchallenging task. Especially, the achievement of precise and controllable image\nreconstruction bears great significance in propelling the progress and\nutilization of brain-computer interfaces. 
Despite the advancements in complex\nimage reconstruction techniques, the challenge persists in achieving a cohesive\nalignment of both semantic (concepts and objects) and structure (position,\norientation, and size) with the image stimuli. To address the aforementioned\nissue, we propose a two-stage image reconstruction model called MindDiffuser.\nIn Stage 1, the VQ-VAE latent representations and the CLIP text embeddings\ndecoded from fMRI are put into Stable Diffusion, which yields a preliminary\nimage that contains semantic information. In Stage 2, we utilize the CLIP\nvisual feature decoded from fMRI as supervisory information, and continually\nadjust the two feature vectors decoded in Stage 1 through backpropagation to\nalign the structural information. The results of both qualitative and\nquantitative analyses demonstrate that our model has surpassed the current\nstate-of-the-art models on Natural Scenes Dataset (NSD). The subsequent\nexperimental findings corroborate the neurobiological plausibility of the\nmodel, as evidenced by the interpretability of the multimodal feature employed,\nwhich align with the corresponding brain responses.\n","authors":["Yizhuo Lu","Changde Du","Qiongyi zhou","Dianpeng Wang","Huiguang He"],"pdf_url":"https://arxiv.org/pdf/2308.04249v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.14139"},{"id":"http://arxiv.org/abs/2308.04243v1","updated":"2023-08-08T13:17:20Z","published":"2023-08-08T13:17:20Z","title":"AICSD: Adaptive Inter-Class Similarity Distillation for Semantic\n Segmentation","summary":" In recent years, deep neural networks have achieved remarkable accuracy in\ncomputer vision tasks. With inference time being a crucial factor, particularly\nin dense prediction tasks such as semantic segmentation, knowledge distillation\nhas emerged as a successful technique for improving the accuracy of lightweight\nstudent networks. The existing methods often neglect the information in\nchannels and among different classes. To overcome these limitations, this paper\nproposes a novel method called Inter-Class Similarity Distillation (ICSD) for\nthe purpose of knowledge distillation. The proposed method transfers high-order\nrelations from the teacher network to the student network by independently\ncomputing intra-class distributions for each class from network outputs. This\nis followed by calculating inter-class similarity matrices for distillation\nusing KL divergence between distributions of each pair of classes. To further\nimprove the effectiveness of the proposed method, an Adaptive Loss Weighting\n(ALW) training strategy is proposed. Unlike existing methods, the ALW strategy\ngradually reduces the influence of the teacher network towards the end of\ntraining process to account for errors in teacher's predictions. Extensive\nexperiments conducted on two well-known datasets for semantic segmentation,\nCityscapes and Pascal VOC 2012, validate the effectiveness of the proposed\nmethod in terms of mIoU and pixel accuracy. The proposed method outperforms\nmost of existing knowledge distillation methods as demonstrated by both\nquantitative and qualitative evaluations. Code is available at:\nhttps://github.com/AmirMansurian/AICSD\n","authors":["Amir M. 
Mansourian","Rozhan Ahmadi","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2308.04243v1.pdf","comment":"10 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.09724v3","updated":"2023-08-08T13:14:26Z","published":"2023-07-19T02:26:20Z","title":"AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks","summary":" To deliver the artistic expression of the target style, recent studies\nexploit the attention mechanism owing to its ability to map the local patches\nof the style image to the corresponding patches of the content image. However,\nbecause of the low semantic correspondence between arbitrary content and\nartworks, the attention module repeatedly abuses specific local patches from\nthe style image, resulting in disharmonious and evident repetitive artifacts.\nTo overcome this limitation and accomplish impeccable artistic style transfer,\nwe focus on enhancing the attention mechanism and capturing the rhythm of\npatterns that organize the style. In this paper, we introduce a novel metric,\nnamely pattern repeatability, that quantifies the repetition of patterns in the\nstyle image. Based on the pattern repeatability, we propose Aesthetic\nPattern-Aware style transfer Networks (AesPA-Net) that discover the sweet spot\nof local and global style expressions. In addition, we propose a novel\nself-supervisory task to encourage the attention mechanism to learn precise and\nmeaningful semantic correspondence. Lastly, we introduce the patch-wise style\nloss to transfer the elaborate rhythm of local patterns. Through qualitative\nand quantitative evaluations, we verify the reliability of the proposed pattern\nrepeatability that aligns with human perception, and demonstrate the\nsuperiority of the proposed framework.\n","authors":["Kibeom Hong","Seogkyu Jeon","Junsoo Lee","Namhyuk Ahn","Kunhee Kim","Pilhyeon Lee","Daesik Kim","Youngjung Uh","Hyeran Byun"],"pdf_url":"https://arxiv.org/pdf/2307.09724v3.pdf","comment":"Accepted by ICCV 2023. Code is available at this\n https://github.com/Kibeom-Hong/AesPA-Net"},{"id":"http://arxiv.org/abs/2304.08134v3","updated":"2023-08-08T12:57:36Z","published":"2023-04-17T10:29:26Z","title":"Tackling Face Verification Edge Cases: In-Depth Analysis and\n Human-Machine Fusion Approach","summary":" Nowadays, face recognition systems surpass human performance on several\ndatasets. However, there are still edge cases that the machine can't correctly\nclassify. This paper investigates the effect of a combination of machine and\nhuman operators in the face verification task. First, we look closer at the\nedge cases for several state-of-the-art models to discover common datasets'\nchallenging settings. Then, we conduct a study with 60 participants on these\nselected tasks with humans and provide an extensive analysis. Finally, we\ndemonstrate that combining machine and human decisions can further improve the\nperformance of state-of-the-art face verification systems on various benchmark\ndatasets. Code and data are publicly available on GitHub.\n","authors":["Martin Knoche","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2304.08134v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04232v1","updated":"2023-08-08T12:54:05Z","published":"2023-08-08T12:54:05Z","title":"A Comparative Study of Image-to-Image Translation Using GANs for\n Synthetic Child Race Data","summary":" The lack of ethnic diversity in data has been a limiting factor of face\nrecognition techniques in the literature. 
This is particularly the case for\nchildren where data samples are scarce and presents a challenge when seeking to\nadapt machine vision algorithms that are trained on adult data to work on\nchildren. This work proposes the utilization of image-to-image transformation\nto synthesize data of different races and thus adjust the ethnicity of\nchildren's face data. We consider ethnicity as a style and compare three\ndifferent Image-to-Image neural network based methods, specifically pix2pix,\nCycleGAN, and CUT networks to implement Caucasian child data and Asian child\ndata conversion. Experimental validation results on synthetic data demonstrate\nthe feasibility of using image-to-image transformation methods to generate\nvarious synthetic child data samples with broader ethnic diversity.\n","authors":["Wang Yao","Muhammad Ali Farooq","Joseph Lemley","Peter Corcoran"],"pdf_url":"https://arxiv.org/pdf/2308.04232v1.pdf","comment":"The Paper is accepted in 25th Irish Machine Vision and Image\n Processing Conference (IMVIP23)"},{"id":"http://arxiv.org/abs/2308.04224v1","updated":"2023-08-08T12:43:26Z","published":"2023-08-08T12:43:26Z","title":"Will your Doorbell Camera still recognize you as you grow old","summary":" Robust authentication for low-power consumer devices such as doorbell cameras\nposes a valuable and unique challenge. This work explores the effect of age and\naging on the performance of facial authentication methods. Two public age\ndatasets, AgeDB and Morph-II have been used as baselines in this work. A\nphoto-realistic age transformation method has been employed to augment a set of\nhigh-quality facial images with various age effects. Then the effect of these\nsynthetic aging data on the high-performance deep-learning-based face\nrecognition model is quantified by using various metrics including Receiver\nOperating Characteristic (ROC) curves and match score distributions.\nExperimental results demonstrate that long-term age effects are still a\nsignificant challenge for the state-of-the-art facial authentication method.\n","authors":["Wang Yao","Muhammad Ali Farooq","Joseph Lemley","Peter Corcoran"],"pdf_url":"https://arxiv.org/pdf/2308.04224v1.pdf","comment":"The Paper is accepted in 25th Irish Machine Vision and Image\n Processing Conference (IMVIP23)"},{"id":"http://arxiv.org/abs/2308.04218v1","updated":"2023-08-08T12:30:36Z","published":"2023-08-08T12:30:36Z","title":"AquaSAM: Underwater Image Foreground Segmentation","summary":" The Segment Anything Model (SAM) has revolutionized natural image\nsegmentation, nevertheless, its performance on underwater images is still\nrestricted. This work presents AquaSAM, the first attempt to extend the success\nof SAM on underwater images with the purpose of creating a versatile method for\nthe segmentation of various underwater targets. To achieve this, we begin by\nclassifying and extracting various labels automatically in SUIM dataset.\nSubsequently, we develop a straightforward fine-tuning method to adapt SAM to\ngeneral foreground underwater image segmentation. Through extensive experiments\ninvolving eight segmentation tasks like human divers, we demonstrate that\nAquaSAM outperforms the default SAM model especially at hard tasks like coral\nreefs. 
AquaSAM achieves an average improvement of 7.13% in Dice Similarity\nCoefficient (DSC) and an average improvement of 8.27% in mIoU on underwater\nsegmentation tasks.\n","authors":["Muduo Xu","Jianhao Su","Yutao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04207v1","updated":"2023-08-08T12:17:02Z","published":"2023-08-08T12:17:02Z","title":"Robust retrieval of material chemical states in X-ray microspectroscopy","summary":" X-ray microspectroscopic techniques are essential for studying morphological\nand chemical changes in materials, providing high-resolution structural and\nspectroscopic information. However, the practical data analysis for reliably\nretrieving the chemical states remains a major obstacle to accelerating the\nfundamental understanding of materials in many research fields. In this work,\nwe propose a novel data formulation model for X-ray microspectroscopy and\ndevelop a dedicated unmixing framework to solve this problem, which is robust\nto noise and spectral variability. Moreover, this framework is not limited to\nthe analysis of two-state material chemistry, making it an effective\nalternative to conventional and widely-used methods. In addition, an\nalternative directional multiplier method with provable convergence is applied\nto obtain the solution efficiently. Our framework can accurately identify and\ncharacterize chemical states in complex and heterogeneous samples, even under\nchallenging conditions such as low signal-to-noise ratios and overlapping\nspectral features. Extensive experimental results on simulated and real\ndatasets demonstrate its effectiveness and reliability.\n","authors":["Ting Wang","Xiaotong Wu","Jizhou Li","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04207v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.04206v1","updated":"2023-08-08T12:12:30Z","published":"2023-08-08T12:12:30Z","title":"Exploring Transformers for Open-world Instance Segmentation","summary":" Open-world instance segmentation is a rising task, which aims to segment all\nobjects in the image by learning from a limited number of base-category\nobjects. This task is challenging, as the number of unseen categories could be\nhundreds of times larger than that of seen categories. Recently, the DETR-like\nmodels have been extensively studied in the closed world while remaining\nunexplored in the open world. In this paper, we utilize the Transformer for\nopen-world instance segmentation and present SWORD. Firstly, we introduce\nattaching the stop-gradient operation before the classification head and\nfurther add IoU heads for discovering novel objects. We demonstrate that a\nsimple stop-gradient operation not only prevents the novel objects from being\nsuppressed as background, but also allows the network to enjoy the merit of\nheuristic label assignment. Secondly, we propose a novel contrastive learning\nframework to enlarge the representations between objects and background.\nSpecifically, we maintain a universal object queue to obtain the object center,\nand dynamically select positive and negative samples from the object queries\nfor contrastive learning. While previous works focus only on pursuing average\nrecall and neglect average precision, we show the prominence of SWORD by giving\nconsideration to both criteria. 
Our models achieve state-of-the-art performance\nin various open-world cross-category and cross-dataset generalizations.\nParticularly, in VOC to non-VOC setup, our method sets new state-of-the-art\nresults of 40.0% on ARb100 and 34.9% on ARm100. For COCO to UVO generalization,\nSWORD significantly outperforms the previous best open-world model by 5.9% on\nAPm and 8.1% on ARm100.\n","authors":["Jiannan Wu","Yi Jiang","Bin Yan","Huchuan Lu","Zehuan Yuan","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.04206v1.pdf","comment":"Accepted by ICCV2023. 16 pages"},{"id":"http://arxiv.org/abs/2302.00290v2","updated":"2023-08-08T11:59:25Z","published":"2023-02-01T07:45:10Z","title":"MS-DETR: Multispectral Pedestrian Detection Transformer with Loosely\n Coupled Fusion and Modality-Balanced Optimization","summary":" Multispectral pedestrian detection is an important task for many\naround-the-clock applications, since the visible and thermal modalities can\nprovide complementary information especially under low light conditions. Most\nof the available multispectral pedestrian detectors are based on non-end-to-end\ndetectors, while in this paper, we propose MultiSpectral pedestrian DEtection\nTRansformer (MS-DETR), an end-to-end multispectral pedestrian detector, which\nextends DETR into the field of multi-modal detection. MS-DETR consists of two\nmodality-specific backbones and Transformer encoders, followed by a multi-modal\nTransformer decoder, and the visible and thermal features are fused in the\nmulti-modal Transformer decoder. To well resist the misalignment between\nmulti-modal images, we design a loosely coupled fusion strategy by sparsely\nsampling some keypoints from multi-modal features independently and fusing them\nwith adaptively learned attention weights. Moreover, based on the insight that\nnot only different modalities, but also different pedestrian instances tend to\nhave different confidence scores to final detection, we further propose an\ninstance-aware modality-balanced optimization strategy, which preserves visible\nand thermal decoder branches and aligns their predicted slots through an\ninstance-wise dynamic loss. Our end-to-end MS-DETR shows superior performance\non the challenging KAIST, CVC-14 and LLVIP benchmark datasets. The source code\nis available at https://github.com/YinghuiXing/MS-DETR .\n","authors":["Yinghui Xing","Song Wang","Shizhou Zhang","Guoqiang Liang","Xiuwei Zhang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.00290v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04197v1","updated":"2023-08-08T11:49:04Z","published":"2023-08-08T11:49:04Z","title":"D3G: Exploring Gaussian Prior for Temporal Sentence Grounding with\n Glance Annotation","summary":" Temporal sentence grounding (TSG) aims to locate a specific moment from an\nuntrimmed video with a given natural language query. Recently, weakly\nsupervised methods still have a large performance gap compared to fully\nsupervised ones, while the latter requires laborious timestamp annotations. In\nthis study, we aim to reduce the annotation cost yet keep competitive\nperformance for TSG task compared to fully supervised ones. To achieve this\ngoal, we investigate a recently proposed glance-supervised temporal sentence\ngrounding task, which requires only single frame annotation (referred to as\nglance annotation) for each query. 
Under this setup, we propose a Dynamic\nGaussian prior based Grounding framework with Glance annotation (D3G), which\nconsists of a Semantic Alignment Group Contrastive Learning module (SA-GCL) and\na Dynamic Gaussian prior Adjustment module (DGA). Specifically, SA-GCL samples\nreliable positive moments from a 2D temporal map via jointly leveraging\nGaussian prior and semantic consistency, which contributes to aligning the\npositive sentence-moment pairs in the joint embedding space. Moreover, to\nalleviate the annotation bias resulting from glance annotation and model\ncomplex queries consisting of multiple events, we propose the DGA module, which\nadjusts the distribution dynamically to approximate the ground truth of target\nmoments. Extensive experiments on three challenging benchmarks verify the\neffectiveness of the proposed D3G. It outperforms the state-of-the-art weakly\nsupervised methods by a large margin and narrows the performance gap compared\nto fully supervised methods. Code is available at\nhttps://github.com/solicucu/D3G.\n","authors":["Hanjun Li","Xiujun Shu","Sunan He","Ruizhi Qiao","Wei Wen","Taian Guo","Bei Gan","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2308.04197v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09788v2","updated":"2023-08-08T11:36:26Z","published":"2023-07-19T07:11:45Z","title":"Density-invariant Features for Distant Point Cloud Registration","summary":" Registration of distant outdoor LiDAR point clouds is crucial to extending\nthe 3D vision of collaborative autonomous vehicles, and yet is challenging due\nto small overlapping area and a huge disparity between observed point\ndensities. In this paper, we propose Group-wise Contrastive Learning (GCL)\nscheme to extract density-invariant geometric features to register distant\noutdoor LiDAR point clouds. We mark through theoretical analysis and\nexperiments that, contrastive positives should be independent and identically\ndistributed (i.i.d.), in order to train densityinvariant feature extractors. We\npropose upon the conclusion a simple yet effective training scheme to force the\nfeature of multiple point clouds in the same spatial location (referred to as\npositive groups) to be similar, which naturally avoids the sampling bias\nintroduced by a pair of point clouds to conform with the i.i.d. principle. The\nresulting fully-convolutional feature extractor is more powerful and\ndensity-invariant than state-of-the-art methods, improving the registration\nrecall of distant scenarios on KITTI and nuScenes benchmarks by 40.9% and\n26.9%, respectively. Code is available at https://github.com/liuQuan98/GCL.\n","authors":["Quan Liu","Hongzi Zhu","Yunsong Zhou","Hongyang Li","Shan Chang","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2307.09788v2.pdf","comment":"In Proceedings of the IEEE/CVF International Conference on Computer\n Vision (ICCV), 2023"},{"id":"http://arxiv.org/abs/2308.04188v1","updated":"2023-08-08T11:23:56Z","published":"2023-08-08T11:23:56Z","title":"Image Copy-Move Forgery Detection via Deep Cross-Scale PatchMatch","summary":" The recently developed deep algorithms achieve promising progress in the\nfield of image copy-move forgery detection (CMFD). However, they have limited\ngeneralizability in some practical scenarios, where the copy-move objects may\nnot appear in the training images or cloned regions are from the background. 
To\naddress the above issues, in this work, we propose a novel end-to-end CMFD\nframework by integrating merits from both conventional and deep methods.\nSpecifically, we design a deep cross-scale patchmatch method tailored for CMFD\nto localize copy-move regions. In contrast to existing deep models, our scheme\naims to seek explicit and reliable point-to-point matching between source and\ntarget regions using features extracted from high-resolution scales. Further,\nwe develop a manipulation region location branch for source/target separation.\nThe proposed CMFD framework is completely differentiable and can be trained in\nan end-to-end manner. Extensive experimental results demonstrate the high\ngeneralizability of our method to different copy-move contents, and the\nproposed scheme achieves significantly better performance than existing\napproaches.\n","authors":["Yingjie He","Yuanman Li","Changsheng Chen","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2308.04188v1.pdf","comment":"6 pages, 4 figures, accepted by ICME2023"},{"id":"http://arxiv.org/abs/2209.14915v2","updated":"2023-08-08T10:30:54Z","published":"2022-09-29T16:22:46Z","title":"Spiking Neural Networks for event-based action recognition: A new task\n to understand their advantage","summary":" Spiking Neural Networks (SNN) are characterised by their unique temporal\ndynamics, but the properties and advantages of such computations are still not\nwell understood. In order to provide answers, in this work we demonstrate how\nSpiking neurons can enable temporal feature extraction in feed-forward neural\nnetworks without the need for recurrent synapses, showing how their\nbio-inspired computing principles can be successfully exploited beyond energy\nefficiency gains and evidencing their differences with respect to conventional\nneurons. This is demonstrated by proposing a new task, DVS-Gesture-Chain\n(DVS-GC), which allows, for the first time, to evaluate the perception of\ntemporal dependencies in a real event-based action recognition dataset. Our\nstudy proves how the widely used DVS Gesture benchmark could be solved by\nnetworks without temporal feature extraction, unlike the new DVS-GC which\ndemands an understanding of the ordering of the events. Furthermore, this setup\nallowed us to unveil the role of the leakage rate in spiking neurons for\ntemporal processing tasks and demonstrated the benefits of \"hard reset\"\nmechanisms. Additionally, we also show how time-dependent weights and\nnormalization can lead to understanding order by means of temporal attention.\n","authors":["Alex Vicente-Sola","Davide L. Manna","Paul Kirkland","Gaetano Di Caterina","Trevor Bihl"],"pdf_url":"https://arxiv.org/pdf/2209.14915v2.pdf","comment":"New article superseding the one in previous versions"},{"id":"http://arxiv.org/abs/2308.04177v1","updated":"2023-08-08T10:30:34Z","published":"2023-08-08T10:30:34Z","title":"How Generalizable are Deepfake Detectors? An Empirical Study","summary":" Deepfake videos and images are becoming increasingly credible, posing a\nsignificant threat given their potential to facilitate fraud or bypass access\ncontrol systems. This has motivated the development of deepfake detection\nmethods, in which deep learning models are trained to distinguish between real\nand synthesized footage. Unfortunately, existing detection models struggle to\ngeneralize to deepfakes from datasets they were not trained on, but little work\nhas been done to examine why or how this limitation can be addressed. 
In this\npaper, we present the first empirical study on the generalizability of deepfake\ndetectors, an essential goal for detectors to stay one step ahead of attackers.\nOur study utilizes six deepfake datasets, five deepfake detection methods, and\ntwo model augmentation approaches, confirming that detectors do not generalize\nin zero-shot settings. Additionally, we find that detectors are learning\nunwanted properties specific to synthesis methods and struggling to extract\ndiscriminative features, limiting their ability to generalize. Finally, we find\nthat there are neurons universally contributing to detection across seen and\nunseen datasets, illuminating a possible path forward to zero-shot\ngeneralizability.\n","authors":["Boquan Li","Jun Sun","Christopher M. Poskitt"],"pdf_url":"https://arxiv.org/pdf/2308.04177v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2301.10227v2","updated":"2023-08-08T10:18:04Z","published":"2023-01-02T14:17:08Z","title":"Denoising Diffusion Probabilistic Models for Generation of Realistic\n Fully-Annotated Microscopy Image Data Sets","summary":" Recent advances in computer vision have led to significant progress in the\ngeneration of realistic image data, with denoising diffusion probabilistic\nmodels proving to be a particularly effective method. In this study, we\ndemonstrate that diffusion models can effectively generate fully-annotated\nmicroscopy image data sets through an unsupervised and intuitive approach,\nusing rough sketches of desired structures as the starting point. The proposed\npipeline helps to reduce the reliance on manual annotations when training deep\nlearning-based segmentation approaches and enables the segmentation of diverse\ndatasets without the need for human annotations. This approach holds great\npromise in streamlining the data generation process and enabling a more\nefficient and scalable training of segmentation models, as we show in the\nexample of different practical experiments involving various organisms and cell\ntypes.\n","authors":["Dennis Eschweiler","Rüveyda Yilmaz","Matisse Baumann","Ina Laube","Rijo Roy","Abin Jose","Daniel Brückner","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2301.10227v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2301.05609v4","updated":"2023-08-08T10:04:14Z","published":"2023-01-13T15:24:40Z","title":"Co-manipulation of soft-materials estimating deformation from depth\n images","summary":" Human-robot co-manipulation of soft materials, such as fabrics, composites,\nand sheets of paper/cardboard, is a challenging operation that presents several\nrelevant industrial applications. Estimating the deformation state of the\nco-manipulated material is one of the main challenges. Viable methods provide\nthe indirect measure by calculating the human-robot relative distance. In this\npaper, we develop a data-driven model to estimate the deformation state of the\nmaterial from a depth image through a Convolutional Neural Network (CNN).\nFirst, we define the deformation state of the material as the relative\nroto-translation from the current robot pose and a human grasping position. 
The\nmodel estimates the current deformation state through a Convolutional Neural\nNetwork, specifically a DenseNet-121 pretrained on ImageNet. The delta between\nthe current and the desired deformation state is fed to the robot controller\nthat outputs twist commands. The paper describes the developed approach to\nacquire and preprocess the dataset and to train the model. The model is\ncompared with the current state-of-the-art method based on a skeletal tracker\nfrom cameras. Results show that our approach achieves better performance and\navoids the various drawbacks caused by using a skeletal tracker. Finally, we\nalso studied the model performance according to different architectures and\ndataset dimensions to minimize the time required for dataset acquisition.\n","authors":["Giorgio Nicola","Enrico Villagrossi","Nicola Pedrocchi"],"pdf_url":"https://arxiv.org/pdf/2301.05609v4.pdf","comment":"Pre-print, Accepted to Robotics and Computer Integrated Manufacturing"},{"id":"http://arxiv.org/abs/2308.04168v1","updated":"2023-08-08T09:58:22Z","published":"2023-08-08T09:58:22Z","title":"EFaR 2023: Efficient Face Recognition Competition","summary":" This paper presents the summary of the Efficient Face Recognition Competition\n(EFaR) held at the 2023 International Joint Conference on Biometrics (IJCB\n2023). The competition received 17 submissions from 6 different teams. To drive\nfurther development of efficient face recognition models, the submitted\nsolutions are ranked based on a weighted score of the achieved verification\naccuracies on a diverse set of benchmarks, as well as the deployability given\nby the number of floating-point operations and model size. The evaluation of\nsubmissions is extended to bias, cross-quality, and large-scale recognition\nbenchmarks. Overall, the paper gives an overview of the achieved performance\nvalues of the submitted solutions as well as a diverse set of baselines. The\nsubmitted solutions use small, efficient network architectures to reduce the\ncomputational cost; some solutions apply model quantization. An outlook on\npossible techniques that are underrepresented in current solutions is given as\nwell.\n","authors":["Jan Niklas Kolf","Fadi Boutros","Jurek Elliesen","Markus Theuerkauf","Naser Damer","Mohamad Alansari","Oussama Abdul Hay","Sara Alansari","Sajid Javed","Naoufel Werghi","Klemen Grm","Vitomir Štruc","Fernando Alonso-Fernandez","Kevin Hernandez Diaz","Josef Bigun","Anjith George","Christophe Ecabert","Hatef Otroshi Shahreza","Ketan Kotwal","Sébastien Marcel","Iurii Medvedev","Bo Jin","Diogo Nunes","Ahmad Hassanpour","Pankaj Khatiwada","Aafan Ahmad Toor","Bian Yang"],"pdf_url":"https://arxiv.org/pdf/2308.04168v1.pdf","comment":"Accepted at IJCB 2023"},{"id":"http://arxiv.org/abs/2308.04163v1","updated":"2023-08-08T09:50:44Z","published":"2023-08-08T09:50:44Z","title":"Under-Display Camera Image Restoration with Scattering Effect","summary":" The under-display camera (UDC) provides consumers with a full-screen visual\nexperience without any obstruction due to notches or punched holes. However,\nthe semi-transparent nature of the display inevitably introduces severe\ndegradation into UDC images. In this work, we address the UDC image restoration\nproblem with the specific consideration of the scattering effect caused by the\ndisplay. We explicitly model the scattering effect by treating the display as a\npiece of homogeneous scattering medium. 
With the physical model of the\nscattering effect, we improve the image formation pipeline for the image\nsynthesis to construct a realistic UDC dataset with ground truths. To suppress\nthe scattering effect for the eventual UDC image recovery, a two-branch\nrestoration network is designed. More specifically, the scattering branch\nleverages global modeling capabilities of the channel-wise self-attention to\nestimate parameters of the scattering effect from degraded images. While the\nimage branch exploits the local representation advantage of CNN to recover\nclear scenes, implicitly guided by the scattering branch. Extensive experiments\nare conducted on both real-world and synthesized data, demonstrating the\nsuperiority of the proposed method over the state-of-the-art UDC restoration\ntechniques. The source code and dataset are available at\n\\url{https://github.com/NamecantbeNULL/SRUDC}.\n","authors":["Binbin Song","Xiangyu Chen","Shuning Xu","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.04163v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2308.04162v1","updated":"2023-08-08T09:48:00Z","published":"2023-08-08T09:48:00Z","title":"EPCFormer: Expression Prompt Collaboration Transformer for Universal\n Referring Video Object Segmentation","summary":" Audio-guided Video Object Segmentation (A-VOS) and Referring Video Object\nSegmentation (R-VOS) are two highly-related tasks, which both aim to segment\nspecific objects from video sequences according to user-provided expression\nprompts. However, due to the challenges in modeling representations for\ndifferent modalities, contemporary methods struggle to strike a balance between\ninteraction flexibility and high-precision localization and segmentation. In\nthis paper, we address this problem from two perspectives: the alignment\nrepresentation of audio and text and the deep interaction among audio, text,\nand visual features. First, we propose a universal architecture, the Expression\nPrompt Collaboration Transformer, herein EPCFormer. Next, we propose an\nExpression Alignment (EA) mechanism for audio and text expressions. By\nintroducing contrastive learning for audio and text expressions, the proposed\nEPCFormer realizes comprehension of the semantic equivalence between audio and\ntext expressions denoting the same objects. Then, to facilitate deep\ninteractions among audio, text, and video features, we introduce an\nExpression-Visual Attention (EVA) mechanism. The knowledge of video object\nsegmentation in terms of the expression prompts can seamlessly transfer between\nthe two tasks by deeply exploring complementary cues between text and audio.\nExperiments on well-recognized benchmarks demonstrate that our universal\nEPCFormer attains state-of-the-art results on both tasks. The source code of\nEPCFormer will be made publicly available at\nhttps://github.com/lab206/EPCFormer.\n","authors":["Jiajun Chen","Jiacheng Lin","Zhiqiang Xiao","Haolong Fu","Ke Nai","Kailun Yang","Zhiyong Li"],"pdf_url":"https://arxiv.org/pdf/2308.04162v1.pdf","comment":"The source code will be made publicly available at\n https://github.com/lab206/EPCFormer"},{"id":"http://arxiv.org/abs/2306.10046v2","updated":"2023-08-08T09:46:21Z","published":"2023-06-12T08:21:50Z","title":"Document Layout Annotation: Database and Benchmark in the Domain of\n Public Affairs","summary":" Every day, thousands of digital documents are generated with useful\ninformation for companies, public organizations, and citizens. 
Given the\nimpossibility of processing them manually, the automatic processing of these\ndocuments is becoming increasingly necessary in certain sectors. However, this\ntask remains challenging, since in most cases a text-only based parsing is not\nenough to fully understand the information presented through different\ncomponents of varying significance. In this regard, Document Layout Analysis\n(DLA) has been an interesting research field for many years, which aims to\ndetect and classify the basic components of a document. In this work, we used a\nprocedure to semi-automatically annotate digital documents with different\nlayout labels, including 4 basic layout blocks and 4 text categories. We apply\nthis procedure to collect a novel database for DLA in the public affairs\ndomain, using a set of 24 data sources from the Spanish Administration. The\ndatabase comprises 37.9K documents with more than 441K document pages, and more\nthan 8M labels associated to 8 layout block units. The results of our\nexperiments validate the proposed text labeling procedure with accuracy up to\n99%.\n","authors":["Alejandro Peña","Aythami Morales","Julian Fierrez","Javier Ortega-Garcia","Marcos Grande","Iñigo Puente","Jorge Cordova","Gonzalo Cordova"],"pdf_url":"https://arxiv.org/pdf/2306.10046v2.pdf","comment":"Accepted in ICDAR 2023 Workshop on Machine Vision and NLP for\n Document Analysis"},{"id":"http://arxiv.org/abs/2308.04156v1","updated":"2023-08-08T09:37:18Z","published":"2023-08-08T09:37:18Z","title":"Towards Top-Down Stereoscopic Image Quality Assessment via Stereo\n Attention","summary":" Stereoscopic image quality assessment (SIQA) plays a crucial role in\nevaluating and improving the visual experience of 3D content. Existing\nbinocular properties and attention-based methods for SIQA have achieved\npromising performance. However, these bottom-up approaches are inadequate in\nexploiting the inherent characteristics of the human visual system (HVS). This\npaper presents a novel network for SIQA via stereo attention, employing a\ntop-down perspective to guide the quality assessment process. Our proposed\nmethod realizes the guidance from high-level binocular signals down to\nlow-level monocular signals, while the binocular and monocular information can\nbe calibrated progressively throughout the processing pipeline. We design a\ngeneralized Stereo AttenTion (SAT) block to implement the top-down philosophy\nin stereo perception. This block utilizes the fusion-generated attention map as\na high-level binocular modulator, influencing the representation of two\nlow-level monocular features. Additionally, we introduce an Energy Coefficient\n(EC) to account for recent findings indicating that binocular responses in the\nprimate primary visual cortex are less than the sum of monocular responses. The\nadaptive EC can tune the magnitude of binocular response flexibly, thus\nenhancing the formation of robust binocular features within our framework. To\nextract the most discriminative quality information from the summation and\nsubtraction of the two branches of monocular features, we utilize a\ndual-pooling strategy that applies min-pooling and max-pooling operations to\nthe respective branches. Experimental results highlight the superiority of our\ntop-down method in simulating the property of visual perception and advancing\nthe state-of-the-art in the SIQA field. 
The code of this work is available at\nhttps://github.com/Fanning-Zhang/SATNet.\n","authors":["Huilin Zhang","Sumei Li","Yongli Chang"],"pdf_url":"https://arxiv.org/pdf/2308.04156v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.04152v1","updated":"2023-08-08T09:32:43Z","published":"2023-08-08T09:32:43Z","title":"Empowering Vision-Language Models to Follow Interleaved Vision-Language\n Instructions","summary":" Multimodal Large Language Models (MLLMs) have recently sparked significant\ninterest, which demonstrates emergent capabilities to serve as a\ngeneral-purpose model for various vision-language tasks. However, existing\nmethods mainly focus on limited types of instructions with a single image as\nvisual context, which hinders the widespread availability of MLLMs. In this\npaper, we introduce the I4 benchmark to comprehensively evaluate the\ninstruction following ability on complicated interleaved vision-language\ninstructions, which involve intricate image-text sequential context, covering a\ndiverse range of scenarios (e.g., visually-rich webpages/textbooks, lecture\nslides, embodied dialogue). Systematic evaluation on our I4 benchmark reveals a\ncommon defect of existing methods: the Visual Prompt Generator (VPG) trained on\nimage-captioning alignment objective tends to attend to common foreground\ninformation for captioning but struggles to extract specific information\nrequired by particular tasks. To address this issue, we propose a generic and\nlightweight controllable knowledge re-injection module, which utilizes the\nsophisticated reasoning ability of LLMs to control the VPG to conditionally\nextract instruction-specific visual information and re-inject it into the LLM.\nFurther, we introduce an annotation-free cross-attention guided counterfactual\nimage training strategy to methodically learn the proposed module by\ncollaborating a cascade of foundation models. Enhanced by the proposed module\nand training strategy, we present Cheetah, a MLLM that can effectively handle a\nwide variety of interleaved vision-language instructions and achieves\nstate-of-the-art zero-shot performance across all tasks of I4, without\nhigh-quality multimodal instruction tuning data. Moreover, Cheetah also\nexhibits competitive performance compared with state-of-the-art instruction\ntuned models on concurrent MME benchmark.\n","authors":["Juncheng Li","Kaihang Pan","Zhiqi Ge","Minghe Gao","Hanwang Zhang","Wei Ji","Wenqiao Zhang","Tat-Seng Chua","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2308.04152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04151v1","updated":"2023-08-08T09:32:15Z","published":"2023-08-08T09:32:15Z","title":"Application for White Spot Syndrome Virus (WSSV) Monitoring using Edge\n Machine Learning","summary":" The aquaculture industry, strongly reliant on shrimp exports, faces\nchallenges due to viral infections like the White Spot Syndrome Virus (WSSV)\nthat severely impact output yields. In this context, computer vision can play a\nsignificant role in identifying features not immediately evident to skilled or\nuntrained eyes, potentially reducing the time required to report WSSV\ninfections. In this study, the challenge of limited data for WSSV recognition\nwas addressed. A mobile application dedicated to data collection and monitoring\nwas developed to facilitate the creation of an image dataset to train a WSSV\nrecognition model and improve country-wide disease surveillance. 
The study also\nincludes a thorough analysis of WSSV recognition to address the challenge of\nimbalanced learning and on-device inference. The models explored,\nMobileNetV3-Small and EfficientNetV2-B0, gained an F1-Score of 0.72 and 0.99\nrespectively. The saliency heatmaps of both models were also observed to\nuncover the \"black-box\" nature of these models and to gain insight as to what\nfeatures in the images are most important in making a prediction. These results\nhighlight the effectiveness and limitations of using models designed for\nresource-constrained devices and balancing their performance in accurately\nrecognizing WSSV, providing valuable information and direction in the use of\ncomputer vision in this domain.\n","authors":["Lorenzo S. Querol","Macario O. Cordel II","Dan Jeric A. Rustia","Mary Nia M. Santos"],"pdf_url":"https://arxiv.org/pdf/2308.04151v1.pdf","comment":"6 pages, 7 figures, conference"},{"id":"http://arxiv.org/abs/2308.02632v2","updated":"2023-08-08T09:21:40Z","published":"2023-08-04T17:44:27Z","title":"Generation of Realistic Synthetic Raw Radar Data for Automated Driving\n Applications using Generative Adversarial Networks","summary":" The main approaches for simulating FMCW radar are based on ray tracing, which\nis usually computationally intensive and do not account for background noise.\nThis work proposes a faster method for FMCW radar simulation capable of\ngenerating synthetic raw radar data using generative adversarial networks\n(GAN). The code and pre-trained weights are open-source and available on\nGitHub. This method generates 16 simultaneous chirps, which allows the\ngenerated data to be used for the further development of algorithms for\nprocessing radar data (filtering and clustering). This can increase the\npotential for data augmentation, e.g., by generating data in non-existent or\nsafety-critical scenarios that are not reproducible in real life. In this work,\nthe GAN was trained with radar measurements of a motorcycle and used to\ngenerate synthetic raw radar data of a motorcycle traveling in a straight line.\nFor generating this data, the distance of the motorcycle and Gaussian noise are\nused as input to the neural network. The synthetic generated radar chirps were\nevaluated using the Frechet Inception Distance (FID). Then, the Range-Azimuth\n(RA) map is calculated twice: first, based on synthetic data using this GAN\nand, second, based on real data. Based on these RA maps, an algorithm with\nadaptive threshold and edge detection is used for object detection. The results\nhave shown that the data is realistic in terms of coherent radar reflections of\nthe motorcycle and background noise based on the comparison of chirps, the RA\nmaps and the object detection results. Thus, the proposed method in this work\nhas shown to minimize the simulation-to-reality gap for the generation of radar\ndata.\n","authors":["Eduardo C. Fidelis","Fabio Reway","Herick Y. S. Ribeiro","Pietro L. Campos","Werner Huber","Christian Icking","Lester A. 
Faria","Torsten Schön"],"pdf_url":"https://arxiv.org/pdf/2308.02632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2002.03729v3","updated":"2023-08-08T09:18:57Z","published":"2020-01-16T09:38:50Z","title":"A lightweight target detection algorithm based on Mobilenet Convolution","summary":" Target detection algorithms based on deep learning require high-end GPU\nconfigurations, sometimes even high-performance deep learning workstations,\nwhich not only increases cost but also greatly limits practical deployment.\nThis paper introduces a lightweight target detection algorithm that balances\naccuracy and computational efficiency, with MobileNet as the backbone. The\nprocessing speed is 30fps on the RTX2060 card for images with a resolution of\n320*320.\n","authors":["Shengquan Wang"],"pdf_url":"https://arxiv.org/pdf/2002.03729v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04142v1","updated":"2023-08-08T09:03:46Z","published":"2023-08-08T09:03:46Z","title":"Class-level Structural Relation Modelling and Smoothing for Visual\n Representation Learning","summary":" Representation learning for images has been advanced by recent progress in\nmore complex neural models such as the Vision Transformers and new learning\ntheories such as the structural causal models. However, these models mainly\nrely on the classification loss to implicitly regularize the class-level data\ndistributions, and they may face difficulties when handling classes with\ndiverse visual patterns. We argue that the incorporation of the structural\ninformation between data samples may improve this situation. To achieve this\ngoal, this paper presents a framework termed \\textbf{C}lass-level Structural\nRelation Modeling and Smoothing for Visual Representation Learning (CSRMS),\nwhich includes the Class-level Relation Modelling, Class-aware Graph Sampling,\nand Relational Graph-Guided Representation Learning modules to model a\nrelational graph of the entire dataset and perform class-aware smoothing and\nregularization operations to alleviate the issue of intra-class visual\ndiversity and inter-class similarity. Specifically, the Class-level Relation\nModelling module uses a clustering algorithm to learn the data distributions in\nthe feature space and identify three types of class-level sample relations for\nthe training set; Class-aware Graph Sampling module extends typical training\nbatch construction process with three strategies to sample dataset-level\nsub-graphs; and Relational Graph-Guided Representation Learning module employs\na graph convolution network with knowledge-guided smoothing operations to ease\nthe projection from different visual patterns to the same class. 
Experiments\ndemonstrate the effectiveness of structured knowledge modelling for enhanced\nrepresentation learning and show that CSRMS can be incorporated with any\nstate-of-the-art visual representation learning models for performance gains.\nThe source codes and demos have been released at\nhttps://github.com/czt117/CSRMS.\n","authors":["Zitan Chen","Zhuang Qi","Xiao Cao","Xiangxian Li","Xiangxu Meng","Lei Meng"],"pdf_url":"https://arxiv.org/pdf/2308.04142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04137v1","updated":"2023-08-08T08:50:27Z","published":"2023-08-08T08:50:27Z","title":"Comprehensive Assessment of the Performance of Deep Learning Classifiers\n Reveals a Surprising Lack of Robustness","summary":" Reliable and robust evaluation methods are a necessary first step towards\ndeveloping machine learning models that are themselves robust and reliable.\nUnfortunately, current evaluation protocols typically used to assess\nclassifiers fail to comprehensively evaluate performance as they tend to rely\non limited types of test data, and ignore others. For example, using the\nstandard test data fails to evaluate the predictions made by the classifier to\nsamples from classes it was not trained on. On the other hand, testing with\ndata containing samples from unknown classes fails to evaluate how well the\nclassifier can predict the labels for known classes. This article advocates\nbench-marking performance using a wide range of different types of data and\nusing a single metric that can be applied to all such data types to produce a\nconsistent evaluation of performance. Using such a benchmark it is found that\ncurrent deep neural networks, including those trained with methods that are\nbelieved to produce state-of-the-art robustness, are extremely vulnerable to\nmaking mistakes on certain types of data. This means that such models will be\nunreliable in real-world scenarios where they may encounter data from many\ndifferent domains, and that they are insecure as they can easily be fooled into\nmaking the wrong decisions. It is hoped that these results will motivate the\nwider adoption of more comprehensive testing methods that will, in turn, lead\nto the development of more robust machine learning methods in the future.\n Code is available at:\n\\url{https://codeberg.org/mwspratling/RobustnessEvaluation}\n","authors":["Michael W. Spratling"],"pdf_url":"https://arxiv.org/pdf/2308.04137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18651v3","updated":"2023-08-08T08:48:48Z","published":"2023-05-29T23:06:05Z","title":"UMD: Unsupervised Model Detection for X2X Backdoor Attacks","summary":" Backdoor (Trojan) attack is a common threat to deep neural networks, where\nsamples from one or more source classes embedded with a backdoor trigger will\nbe misclassified to adversarial target classes. Existing methods for detecting\nwhether a classifier is backdoor attacked are mostly designed for attacks with\na single adversarial target (e.g., all-to-one attack). To the best of our\nknowledge, without supervision, no existing methods can effectively address the\nmore general X2X attack with an arbitrary number of source classes, each paired\nwith an arbitrary target class. 
In this paper, we propose UMD, the first\nUnsupervised Model Detection method that effectively detects X2X backdoor\nattacks via a joint inference of the adversarial (source, target) class pairs.\nIn particular, we first define a novel transferability statistic to measure and\nselect a subset of putative backdoor class pairs based on a proposed clustering\napproach. Then, these selected class pairs are jointly assessed based on an\naggregation of their reverse-engineered trigger size for detection inference,\nusing a robust and unsupervised anomaly detector we proposed. We conduct\ncomprehensive evaluations on CIFAR-10, GTSRB, and Imagenette dataset, and show\nthat our unsupervised UMD outperforms SOTA detectors (even with supervision) by\n17%, 4%, and 8%, respectively, in terms of the detection accuracy against\ndiverse X2X attacks. We also show the strong detection performance of UMD\nagainst several strong adaptive attacks.\n","authors":["Zhen Xiang","Zidi Xiong","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2305.18651v3.pdf","comment":"Proceedings of the 40th International Conference on Machine Learning"},{"id":"http://arxiv.org/abs/2308.04126v1","updated":"2023-08-08T08:30:16Z","published":"2023-08-08T08:30:16Z","title":"OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion\n and Infinite Data Generation","summary":" This paper presents OmniDataComposer, an innovative approach for multimodal\ndata fusion and unlimited data generation with an intent to refine and\nuncomplicate interplay among diverse data modalities. Coming to the core\nbreakthrough, it introduces a cohesive data structure proficient in processing\nand merging multimodal data inputs, which include video, audio, and text. Our\ncrafted algorithm leverages advancements across multiple operations such as\nvideo/image caption extraction, dense caption extraction, Automatic Speech\nRecognition (ASR), Optical Character Recognition (OCR), Recognize Anything\nModel(RAM), and object tracking. OmniDataComposer is capable of identifying\nover 6400 categories of objects, substantially broadening the spectrum of\nvisual information. It amalgamates these diverse modalities, promoting\nreciprocal enhancement among modalities and facilitating cross-modal data\ncorrection. \\textbf{The final output metamorphoses each video input into an\nelaborate sequential document}, virtually transmuting videos into thorough\nnarratives, making them easier to be processed by large language models. Future\nprospects include optimizing datasets for each modality to encourage unlimited\ndata generation. This robust base will offer priceless insights to models like\nChatGPT, enabling them to create higher quality datasets for video captioning\nand easing question-answering tasks based on video content. OmniDataComposer\ninaugurates a new stage in multimodal learning, imparting enormous potential\nfor augmenting AI's understanding and generation of complex, real-world data.\n","authors":["Dongyang Yu","Shihao Wang","Yuan Fang","Wangpeng An"],"pdf_url":"https://arxiv.org/pdf/2308.04126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.14508v3","updated":"2023-08-08T08:29:12Z","published":"2022-08-30T19:32:07Z","title":"Swin-transformer-yolov5 For Real-time Wine Grape Bunch Detection","summary":" In this research, an integrated detection model, Swin-transformer-YOLOv5 or\nSwin-T-YOLOv5, was proposed for real-time wine grape bunch detection to inherit\nthe advantages from both YOLOv5 and Swin-transformer. 
The research was\nconducted on two different grape varieties of Chardonnay (always white berry\nskin) and Merlot (white or white-red mix berry skin when immature; red when\nmatured) from July to September in 2019. To verify the superiority of\nSwin-T-YOLOv5, its performance was compared against several commonly\nused/competitive object detectors, including Faster R-CNN, YOLOv3, YOLOv4, and\nYOLOv5. All models were assessed under different test conditions, including two\ndifferent weather conditions (sunny and cloudy), two different berry maturity\nstages (immature and mature), and three different sunlight\ndirections/intensities (morning, noon, and afternoon) for a comprehensive\ncomparison. Additionally, the predicted number of grape bunches by\nSwin-T-YOLOv5 was further compared with ground truth values, including both\nin-field manual counting and manual labeling during the annotation process.\nResults showed that the proposed Swin-T-YOLOv5 outperformed all other studied\nmodels for grape bunch detection, with up to 97% of mean Average Precision\n(mAP) and 0.89 of F1-score when the weather was cloudy. This mAP was\napproximately 44%, 18%, 14%, and 4% greater than Faster R-CNN, YOLOv3, YOLOv4,\nand YOLOv5, respectively. Swin-T-YOLOv5 achieved its lowest mAP (90%) and\nF1-score (0.82) when detecting immature berries, where the mAP was\napproximately 40%, 5%, 3%, and 1% greater than that of the same four detectors.\nFurthermore, Swin-T-YOLOv5 performed better on the Chardonnay variety, achieving up to 0.91\nof R2 and 2.36 root mean square error (RMSE) when comparing the predictions\nwith ground truth. However, it underperformed on the Merlot variety, achieving\nonly up to 0.70 of R2 and 3.30 of RMSE.\n","authors":["Shenglian Lu","Xiaoyu Liu","Zixaun He","Wenbo Liu","Xin Zhang","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2208.14508v3.pdf","comment":"30 pages; 15 figures;Corresponding author: Xin Zhang Department of\n Agricultural and Biological Engineering Mississippi State University\n Mississippi State, MS 39762, USA (xzhang@abe.msstate.edu)"},{"id":"http://arxiv.org/abs/2301.11514v4","updated":"2023-08-08T08:26:20Z","published":"2023-01-27T03:18:09Z","title":"Deep Industrial Image Anomaly Detection: A Survey","summary":" The recent rapid development of deep learning has laid a milestone in\nindustrial Image Anomaly Detection (IAD). In this paper, we provide a\ncomprehensive review of deep learning-based image anomaly detection techniques,\nfrom the perspectives of neural network architectures, levels of supervision,\nloss functions, metrics and datasets. In addition, we extract a new setting\nfrom industrial manufacturing and review the current IAD approaches under our\nproposed new setting. Moreover, we highlight several open challenges for\nimage anomaly detection. The merits and downsides of representative network\narchitectures under varying supervision are discussed. Finally, we summarize\nthe research findings and point out future research directions. 
More resources\nare available at\nhttps://github.com/M-3LAB/awesome-industrial-anomaly-detection.\n","authors":["Jiaqi Liu","Guoyang Xie","Jingbao Wang","Shangnian Li","Chengjie Wang","Feng Zheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2301.11514v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04118v1","updated":"2023-08-08T08:17:39Z","published":"2023-08-08T08:17:39Z","title":"Multimodal Color Recommendation in Vector Graphic Documents","summary":" Color selection plays a critical role in graphic document design and requires\nsufficient consideration of various contexts. However, recommending appropriate\ncolors which harmonize with the other colors and textual contexts in documents\nis a challenging task, even for experienced designers. In this study, we\npropose a multimodal masked color model that integrates both color and textual\ncontexts to provide text-aware color recommendation for graphic documents. Our\nproposed model comprises self-attention networks to capture the relationships\nbetween colors in multiple palettes, and cross-attention networks that\nincorporate both color and CLIP-based text representations. Our proposed method\nprimarily focuses on color palette completion, which recommends colors based on\nthe given colors and text. Additionally, it is applicable for another color\nrecommendation task, full palette generation, which generates a complete color\npalette corresponding to the given text. Experimental results demonstrate that\nour proposed approach surpasses previous color palette completion methods on\naccuracy, color distribution, and user experience, as well as full palette\ngeneration methods concerning color diversity and similarity to the ground\ntruth palettes.\n","authors":["Qianru Qiu","Xueting Wang","Mayu Otani"],"pdf_url":"https://arxiv.org/pdf/2308.04118v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2303.06209v2","updated":"2023-08-08T08:06:48Z","published":"2023-03-10T21:17:14Z","title":"SemARFlow: Injecting Semantics into Unsupervised Optical Flow Estimation\n for Autonomous Driving","summary":" Unsupervised optical flow estimation is especially hard near occlusions and\nmotion boundaries and in low-texture regions. We show that additional\ninformation such as semantics and domain knowledge can help better constrain\nthis problem. We introduce SemARFlow, an unsupervised optical flow network\ndesigned for autonomous driving data that takes estimated semantic segmentation\nmasks as additional inputs. This additional information is injected into the\nencoder and into a learned upsampler that refines the flow output. In addition,\na simple yet effective semantic augmentation module provides self-supervision\nwhen learning flow and its boundaries for vehicles, poles, and sky. Together,\nthese injections of semantic information improve the KITTI-2015 optical flow\ntest error rate from 11.80% to 8.38%. 
We also show visible improvements around\nobject boundaries as well as a greater ability to generalize across datasets.\nCode is available at\nhttps://github.com/duke-vision/semantic-unsup-flow-release.\n","authors":["Shuai Yuan","Shuzhi Yu","Hannah Kim","Carlo Tomasi"],"pdf_url":"https://arxiv.org/pdf/2303.06209v2.pdf","comment":"Accepted by ICCV-2023; Code is available at\n https://github.com/duke-vision/semantic-unsup-flow-release"},{"id":"http://arxiv.org/abs/2307.14016v3","updated":"2023-08-08T07:57:15Z","published":"2023-07-26T07:57:56Z","title":"RPG-Palm: Realistic Pseudo-data Generation for Palmprint Recognition","summary":" Palmprint recently shows great potential in recognition applications as it is\na privacy-friendly and stable biometric. However, the lack of large-scale\npublic palmprint datasets limits further research and development of palmprint\nrecognition. In this paper, we propose a novel realistic pseudo-palmprint\ngeneration (RPG) model to synthesize palmprints with massive identities. We\nfirst introduce a conditional modulation generator to improve the intra-class\ndiversity. Then an identity-aware loss is proposed to ensure identity\nconsistency against unpaired training. We further improve the B\\'ezier palm\ncreases generation strategy to guarantee identity independence. Extensive\nexperimental results demonstrate that synthetic pretraining significantly\nboosts the recognition model performance. For example, our model improves the\nstate-of-the-art B\\'ezierPalm by more than $5\\%$ and $14\\%$ in terms of\nTAR@FAR=1e-6 under the $1:1$ and $1:3$ Open-set protocol. When accessing only\n$10\\%$ of the real training data, our method still outperforms ArcFace with\n$100\\%$ real training data, indicating that we are closer to real-data-free\npalmprint recognition.\n","authors":["Lei Shen","Jianlong Jin","Ruixin Zhang","Huaen Li","Kai Zhao","Yingyi Zhang","Jingyun Zhang","Shouhong Ding","Yang Zhao","Wei Jia"],"pdf_url":"https://arxiv.org/pdf/2307.14016v3.pdf","comment":"12 pages,8 figures"},{"id":"http://arxiv.org/abs/2308.03463v2","updated":"2023-08-08T07:54:55Z","published":"2023-08-07T10:41:52Z","title":"DiffSynth: Latent In-Iteration Deflickering for Realistic Video\n Synthesis","summary":" In recent years, diffusion models have emerged as the most powerful approach\nin image synthesis. However, applying these models directly to video synthesis\npresents challenges, as it often leads to noticeable flickering contents.\nAlthough recently proposed zero-shot methods can alleviate flicker to some\nextent, we still struggle to generate coherent videos. In this paper, we\npropose DiffSynth, a novel approach that aims to convert image synthesis\npipelines to video synthesis pipelines. DiffSynth consists of two key\ncomponents: a latent in-iteration deflickering framework and a video\ndeflickering algorithm. The latent in-iteration deflickering framework applies\nvideo deflickering to the latent space of diffusion models, effectively\npreventing flicker accumulation in intermediate steps. Additionally, we propose\na video deflickering algorithm, named patch blending algorithm, that remaps\nobjects in different frames and blends them together to enhance video\nconsistency. One of the notable advantages of DiffSynth is its general\napplicability to various video synthesis tasks, including text-guided video\nstylization, fashion video synthesis, image-guided video stylization, video\nrestoring, and 3D rendering. 
In the task of text-guided video stylization, we\nmake it possible to synthesize high-quality videos without cherry-picking. The\nexperimental results demonstrate the effectiveness of DiffSynth. All videos can\nbe viewed on our project page. Source codes will also be released.\n","authors":["Zhongjie Duan","Lizhou You","Chengyu Wang","Cen Chen","Ziheng Wu","Weining Qian","Jun Huang","Fei Chao"],"pdf_url":"https://arxiv.org/pdf/2308.03463v2.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2202.04680v2","updated":"2023-08-08T07:36:57Z","published":"2022-02-09T19:03:05Z","title":"Lifting-based variational multiclass segmentation: design, analysis and\n implementation","summary":" We propose, analyze and realize a variational multiclass segmentation scheme\nthat partitions a given image into multiple regions exhibiting specific\nproperties. Our method determines multiple functions that encode the\nsegmentation regions by minimizing an energy functional combining information\nfrom different channels. Multichannel image data can be obtained by lifting the\nimage into a higher dimensional feature space using specific multichannel\nfiltering or may already be provided by the imaging modality under\nconsideration, such as an RGB image or multimodal medical data. Experimental\nresults show that the proposed method performs well in various scenarios. In\nparticular, promising results are presented for two medical applications\ninvolving classification of brain abscess and tumor growth, respectively. As\nmain theoretical contributions, we prove the existence of global minimizers of\nthe proposed energy functional and show its stability and convergence with\nrespect to noisy inputs. In particular, these results also apply to the special\ncase of binary segmentation, and these results are also novel in this\nparticular situation.\n","authors":["Nadja Gruber","Johannes Schwab","Sebastien Court","Elke Gizewski","Markus Haltmeier"],"pdf_url":"https://arxiv.org/pdf/2202.04680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04091v1","updated":"2023-08-08T07:15:23Z","published":"2023-08-08T07:15:23Z","title":"From Unimodal to Multimodal: improving the sEMG-Based Pattern\n Recognition via deep generative models","summary":" Multimodal hand gesture recognition (HGR) systems can achieve higher\nrecognition accuracy. However, acquiring multimodal gesture recognition data\ntypically requires users to wear additional sensors, thereby increasing\nhardware costs. This paper proposes a novel generative approach to improve\nSurface Electromyography (sEMG)-based HGR accuracy via virtual Inertial\nMeasurement Unit (IMU) signals. Specifically, we trained a deep generative\nmodel based on the intrinsic correlation between forearm sEMG signals and\nforearm IMU signals to generate virtual forearm IMU signals from the input\nforearm sEMG signals at first. Subsequently, the sEMG signals and virtual IMU\nsignals were fed into a multimodal Convolutional Neural Network (CNN) model for\ngesture recognition. To evaluate the performance of the proposed approach, we\nconducted experiments on 6 databases, including 5 publicly available databases\nand our collected database comprising 28 subjects performing 38 gestures,\ncontaining both sEMG and IMU data. The results show that our proposed approach\noutperforms the sEMG-based unimodal HGR method (with increases of\n2.15%-13.10%). 
It demonstrates that incorporating virtual IMU signals,\ngenerated by deep generative models, can significantly enhance the accuracy of\nsEMG-based HGR. The proposed approach represents a successful attempt to\ntransition from unimodal HGR to multimodal HGR without additional sensor\nhardware.\n","authors":["Wentao Wei","Linyan Ren"],"pdf_url":"https://arxiv.org/pdf/2308.04091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09880v3","updated":"2023-08-08T07:02:16Z","published":"2023-05-17T01:27:27Z","title":"A survey of the Vision Transformers and its CNN-Transformer based\n Variants","summary":" Vision transformers have become popular as a possible substitute for\nconvolutional neural networks (CNNs) for a variety of computer vision\napplications. These transformers, with their ability to focus on global\nrelationships in images, offer large learning capacity. However, they may\nsuffer from limited generalization as they do not tend to model local\ncorrelation in images. Recently, hybridization of the convolution operation and\nthe self-attention mechanism has emerged in vision transformers to exploit\nboth local and global image representations. These hybrid vision\ntransformers, also referred to as CNN-Transformer architectures, have\ndemonstrated remarkable results in vision applications. Given the rapidly\ngrowing number of hybrid vision transformers, it has become necessary to\nprovide a taxonomy and explanation of these hybrid architectures. This survey\npresents a taxonomy of the recent vision transformer architectures and more\nspecifically that of the hybrid vision transformers. Additionally, the key\nfeatures of these architectures such as the attention mechanisms, positional\nembeddings, multi-scale processing, and convolution are also discussed. In\ncontrast to the previous survey papers that are primarily focused on individual\nvision transformer architectures or CNNs, this survey uniquely emphasizes the\nemerging trend of hybrid vision transformers. By showcasing the potential of\nhybrid vision transformers to deliver exceptional performance across a range of\ncomputer vision tasks, this survey sheds light on the future directions of this\nrapidly evolving architecture.\n","authors":["Asifullah Khan","Zunaira Rauf","Anabia Sohail","Abdul Rehman","Hifsa Asif","Aqsa Asif","Umair Farooq"],"pdf_url":"https://arxiv.org/pdf/2305.09880v3.pdf","comment":"Pages: 58, Figures: 14"},{"id":"http://arxiv.org/abs/2308.01006v3","updated":"2023-08-08T06:45:25Z","published":"2023-08-02T08:29:44Z","title":"FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of\n Autonomous Driving","summary":" Building a multi-modality multi-task neural network toward accurate and\nrobust performance is a de-facto standard in the perception task of autonomous\ndriving. However, leveraging such data from multiple sensors to jointly\noptimize the prediction and planning tasks remains largely unexplored. In this\npaper, we present FusionAD, to the best of our knowledge, the first unified\nframework that fuses the information from the two most critical sensors, camera\nand LiDAR, and goes beyond the perception task. Concretely, we first build a transformer\nbased multi-modality fusion network to effectively produce fusion based\nfeatures. In contrast to the camera-based end-to-end method UniAD, we then\nestablish fusion-aided modality-aware prediction and status-aware planning\nmodules, dubbed FMSPnP, that take advantage of multi-modality features. 
We\nconduct extensive experiments on the commonly used nuScenes benchmark dataset; our\nFusionAD achieves state-of-the-art performance, surpassing baselines by an\naverage of 15% on perception tasks like detection and tracking and by 10% on occupancy\nprediction accuracy, reducing the prediction error from 0.708 to 0.389 in ADE score\nand the collision rate from 0.31% to only 0.12%.\n","authors":["Tengju Ye","Wei Jing","Chunyong Hu","Shikun Huang","Lingping Gao","Fangzhen Li","Jingke Wang","Ke Guo","Wencong Xiao","Weibo Mao","Hang Zheng","Kun Li","Junbo Chen","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01006v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04079v1","updated":"2023-08-08T06:37:06Z","published":"2023-08-08T06:37:06Z","title":"3D Gaussian Splatting for Real-Time Radiance Field Rendering","summary":" Radiance Field methods have recently revolutionized novel-view synthesis of\nscenes captured with multiple photos or videos. However, achieving high visual\nquality still requires neural networks that are costly to train and render,\nwhile recent faster methods inevitably trade off speed for quality. For\nunbounded and complete scenes (rather than isolated objects) and 1080p\nresolution rendering, no current method can achieve real-time display rates. We\nintroduce three key elements that allow us to achieve state-of-the-art visual\nquality while maintaining competitive training times and importantly allow\nhigh-quality real-time (>= 30 fps) novel-view synthesis at 1080p resolution.\nFirst, starting from sparse points produced during camera calibration, we\nrepresent the scene with 3D Gaussians that preserve desirable properties of\ncontinuous volumetric radiance fields for scene optimization while avoiding\nunnecessary computation in empty space; Second, we perform interleaved\noptimization/density control of the 3D Gaussians, notably optimizing\nanisotropic covariance to achieve an accurate representation of the scene;\nThird, we develop a fast visibility-aware rendering algorithm that supports\nanisotropic splatting and both accelerates training and allows realtime\nrendering. We demonstrate state-of-the-art visual quality and real-time\nrendering on several established datasets.\n","authors":["Bernhard Kerbl","Georgios Kopanas","Thomas Leimkühler","George Drettakis"],"pdf_url":"https://arxiv.org/pdf/2308.04079v1.pdf","comment":"https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/"},{"id":"http://arxiv.org/abs/2308.04074v1","updated":"2023-08-08T06:16:37Z","published":"2023-08-08T06:16:37Z","title":"Exploiting Spatial-Temporal Context for Interacting Hand Reconstruction\n on Monocular RGB Video","summary":" Reconstructing interacting hands from monocular RGB data is a challenging\ntask, as it involves many interfering factors, e.g. self- and mutual occlusion\nand similar textures. Previous works only leverage information from a single\nRGB image without modeling their physically plausible relation, which leads to\ninferior reconstruction results. In this work, we are dedicated to explicitly\nexploiting spatial-temporal information to achieve better interacting hand\nreconstruction. On one hand, we leverage temporal context to complement\ninsufficient information provided by the single frame, and design a novel\ntemporal framework with a temporal constraint for interacting hand motion\nsmoothness. On the other hand, we further propose an interpenetration detection\nmodule to produce kinetically plausible interacting hands without physical\ncollisions. 
Extensive experiments are performed to validate the effectiveness\nof our proposed framework, which achieves new state-of-the-art performance on\npublic benchmarks.\n","authors":["Weichao Zhao","Hezhen Hu","Wengang Zhou","Li li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.04074v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.04070v1","updated":"2023-08-08T06:07:49Z","published":"2023-08-08T06:07:49Z","title":"ConDistFL: Conditional Distillation for Federated Learning from\n Partially Annotated Data","summary":" Developing a generalized segmentation model capable of simultaneously\ndelineating multiple organs and diseases is highly desirable. Federated\nlearning (FL) is a key technology enabling the collaborative development of a\nmodel without exchanging training data. However, the limited access to fully\nannotated training data poses a major challenge to training generalizable\nmodels. We propose \"ConDistFL\", a framework to solve this problem by combining\nFL with knowledge distillation. Local models can extract the knowledge of\nunlabeled organs and tumors from partially annotated data from the global model\nwith an adequately designed conditional probability representation. We validate\nour framework on four distinct partially annotated abdominal CT datasets from\nthe MSD and KiTS19 challenges. The experimental results show that the proposed\nframework significantly outperforms FedAvg and FedOpt baselines. Moreover, the\nperformance on an external test dataset demonstrates superior generalizability\ncompared to models trained on each dataset separately. Our ablation study\nsuggests that ConDistFL can perform well without frequent aggregation, reducing\nthe communication cost of FL. Our implementation will be available at\nhttps://github.com/NVIDIA/NVFlare/tree/dev/research/condist-fl.\n","authors":["Pochuan Wang","Chen Shen","Weichung Wang","Masahiro Oda","Chiou-Shann Fuh","Kensaku Mori","Holger R. Roth"],"pdf_url":"https://arxiv.org/pdf/2308.04070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05785v2","updated":"2023-08-08T06:06:35Z","published":"2022-09-13T07:37:53Z","title":"Adversarial Coreset Selection for Efficient Robust Training","summary":" Neural networks are vulnerable to adversarial attacks: adding well-crafted,\nimperceptible perturbations to their input can modify their output. Adversarial\ntraining is one of the most effective approaches to training robust models\nagainst such attacks. Unfortunately, this method is much slower than vanilla\ntraining of neural networks since it needs to construct adversarial examples\nfor the entire training data at every iteration. By leveraging the theory of\ncoreset selection, we show how selecting a small subset of training data\nprovides a principled approach to reducing the time complexity of robust\ntraining. To this end, we first provide convergence guarantees for adversarial\ncoreset selection. In particular, we show that the convergence bound is\ndirectly related to how well our coresets can approximate the gradient computed\nover the entire training data. Motivated by our theoretical analysis, we\npropose using this gradient approximation error as our adversarial coreset\nselection objective to reduce the training set size effectively. Once built, we\nrun adversarial training over this subset of the training data. Unlike existing\nmethods, our approach can be adapted to a wide variety of training objectives,\nincluding TRADES, $\\ell_p$-PGD, and Perceptual Adversarial Training. 
We conduct\nextensive experiments to demonstrate that our approach speeds up adversarial\ntraining by 2-3 times while experiencing a slight degradation in the clean and\nrobust accuracy.\n","authors":["Hadi M. Dolatabadi","Sarah Erfani","Christopher Leckie"],"pdf_url":"https://arxiv.org/pdf/2209.05785v2.pdf","comment":"Accepted to the International Journal of Computer Vision (IJCV).\n Extended version of the ECCV2022 paper: arXiv:2112.00378. arXiv admin note:\n substantial text overlap with arXiv:2112.00378"},{"id":"http://arxiv.org/abs/2305.01160v3","updated":"2023-08-08T05:59:58Z","published":"2023-05-02T02:29:18Z","title":"Long-Tailed Recognition by Mutual Information Maximization between\n Latent Features and Ground-Truth Labels","summary":" Although contrastive learning methods have shown prevailing performance on a\nvariety of representation learning tasks, they encounter difficulty when the\ntraining dataset is long-tailed. Many researchers have combined contrastive\nlearning and a logit adjustment technique to address this problem, but the\ncombinations are done ad-hoc and a theoretical background has not yet been\nprovided. The goal of this paper is to provide the background and further\nimprove the performance. First, we show that the fundamental reason contrastive\nlearning methods struggle with long-tailed tasks is that they try to maximize\nthe mutual information maximization between latent features and input data. As\nground-truth labels are not considered in the maximization, they are not able\nto address imbalances between class labels. Rather, we interpret the\nlong-tailed recognition task as a mutual information maximization between\nlatent features and ground-truth labels. This approach integrates contrastive\nlearning and logit adjustment seamlessly to derive a loss function that shows\nstate-of-the-art performance on long-tailed recognition benchmarks. It also\ndemonstrates its efficacy in image segmentation tasks, verifying its\nversatility beyond image classification.\n","authors":["Min-Kook Suh","Seung-Woo Seo"],"pdf_url":"https://arxiv.org/pdf/2305.01160v3.pdf","comment":"ICML 2023 camera-ready"},{"id":"http://arxiv.org/abs/2308.03529v2","updated":"2023-08-08T05:29:57Z","published":"2023-08-07T12:26:34Z","title":"Feature Decoupling-Recycling Network for Fast Interactive Segmentation","summary":" Recent interactive segmentation methods iteratively take source image, user\nguidance and previously predicted mask as the input without considering the\ninvariant nature of the source image. As a result, extracting features from the\nsource image is repeated in each interaction, resulting in substantial\ncomputational redundancy. In this work, we propose the Feature\nDecoupling-Recycling Network (FDRN), which decouples the modeling components\nbased on their intrinsic discrepancies and then recycles components for each\nuser interaction. Thus, the efficiency of the whole interactive process can be\nsignificantly improved. To be specific, we apply the Decoupling-Recycling\nstrategy from three perspectives to address three types of discrepancies,\nrespectively. First, our model decouples the learning of source image semantics\nfrom the encoding of user guidance to process two types of input domains\nseparately. Second, FDRN decouples high-level and low-level features from\nstratified semantic representations to enhance feature learning. 
Third, during\nthe encoding of user guidance, current user guidance is decoupled from\nhistorical guidance to highlight the effect of current user guidance. We\nconduct extensive experiments on 6 datasets from different domains and\nmodalities, which demonstrate the following merits of our model: 1) superior\nefficiency than other methods, particularly advantageous in challenging\nscenarios requiring long-term interactions (up to 4.25x faster), while\nachieving favorable segmentation performance; 2) strong applicability to\nvarious methods serving as a universal enhancement technique; 3) well\ncross-task generalizability, e.g., to medical image segmentation, and\nrobustness against misleading user guidance.\n","authors":["Huimin Zeng","Weinong Wang","Xin Tao","Zhiwei Xiong","Yu-Wing Tai","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.03529v2.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.04054v1","updated":"2023-08-08T05:29:26Z","published":"2023-08-08T05:29:26Z","title":"An Empirical Analysis of Range for 3D Object Detection","summary":" LiDAR-based 3D detection plays a vital role in autonomous navigation.\nSurprisingly, although autonomous vehicles (AVs) must detect both near-field\nobjects (for collision avoidance) and far-field objects (for longer-term\nplanning), contemporary benchmarks focus only on near-field 3D detection.\nHowever, AVs must detect far-field objects for safe navigation. In this paper,\nwe present an empirical analysis of far-field 3D detection using the long-range\ndetection dataset Argoverse 2.0 to better understand the problem, and share the\nfollowing insight: near-field LiDAR measurements are dense and optimally\nencoded by small voxels, while far-field measurements are sparse and are better\nencoded with large voxels. We exploit this observation to build a collection of\nrange experts tuned for near-vs-far field detection, and propose simple\ntechniques to efficiently ensemble models for long-range detection that improve\nefficiency by 33% and boost accuracy by 3.2% CDS.\n","authors":["Neehar Peri","Mengtian Li","Benjamin Wilson","Yu-Xiong Wang","James Hays","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2308.04054v1.pdf","comment":"Accepted to ICCV 2023 Workshop - Robustness and Reliability of\n Autonomous Vehicles in the Open-World"},{"id":"http://arxiv.org/abs/2308.03177v2","updated":"2023-08-08T05:26:45Z","published":"2023-08-06T18:07:45Z","title":"Boosting Few-shot 3D Point Cloud Segmentation via Query-Guided\n Enhancement","summary":" Although extensive research has been conducted on 3D point cloud\nsegmentation, effectively adapting generic models to novel categories remains a\nformidable challenge. This paper proposes a novel approach to improve point\ncloud few-shot segmentation (PC-FSS) models. Unlike existing PC-FSS methods\nthat directly utilize categorical information from support prototypes to\nrecognize novel classes in query samples, our method identifies two critical\naspects that substantially enhance model performance by reducing contextual\ngaps between support prototypes and query features. Specifically, we (1) adapt\nsupport background prototypes to match query context while removing extraneous\ncues that may obscure foreground and background in query samples, and (2)\nholistically rectify support prototypes under the guidance of query features to\nemulate the latter having no semantic gap to the query targets. 
Our proposed\ndesigns are agnostic to the feature extractor, rendering them readily\napplicable to any prototype-based methods. The experimental results on S3DIS\nand ScanNet demonstrate notable practical benefits, as our approach achieves\nsignificant improvements while still maintaining high efficiency. The code for\nour approach is available at\nhttps://github.com/AaronNZH/Boosting-Few-shot-3D-Point-Cloud-Segmentation-via-Query-Guided-Enhancement\n","authors":["Zhenhua Ning","Zhuotao Tian","Guangming Lu","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.03177v2.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.04052v1","updated":"2023-08-08T05:16:51Z","published":"2023-08-08T05:16:51Z","title":"The Five-Dollar Model: Generating Game Maps and Sprites from Sentence\n Embeddings","summary":" The five-dollar model is a lightweight text-to-image generative architecture\nthat generates low dimensional images from an encoded text prompt. This model\ncan successfully generate accurate and aesthetically pleasing content in low\ndimensional domains, with limited amounts of training data. Despite the small\nsize of both the model and datasets, the generated images are still able to\nmaintain the encoded semantic meaning of the textual prompt. We apply this\nmodel to three small datasets: pixel art video game maps, video game sprite\nimages, and down-scaled emoji images and apply novel augmentation strategies to\nimprove the performance of our model on these limited datasets. We evaluate our\nmodels performance using cosine similarity score between text-image pairs\ngenerated by the CLIP VIT-B/32 model.\n","authors":["Timothy Merino","Roman Negri","Dipika Rajesh","M Charity","Julian Togelius"],"pdf_url":"https://arxiv.org/pdf/2308.04052v1.pdf","comment":"to be published in AIIDE 2023"},{"id":"http://arxiv.org/abs/2306.16670v3","updated":"2023-08-08T05:00:58Z","published":"2023-06-29T04:05:13Z","title":"End-to-End Learnable Multi-Scale Feature Compression for VCM","summary":" The proliferation of deep learning-based machine vision applications has\ngiven rise to a new type of compression, so called video coding for machine\n(VCM). VCM differs from traditional video coding in that it is optimized for\nmachine vision performance instead of human visual quality. In the feature\ncompression track of MPEG-VCM, multi-scale features extracted from images are\nsubject to compression. Recent feature compression works have demonstrated that\nthe versatile video coding (VVC) standard-based approach can achieve a BD-rate\nreduction of up to 96% against MPEG-VCM feature anchor. However, it is still\nsub-optimal as VVC was not designed for extracted features but for natural\nimages. Moreover, the high encoding complexity of VVC makes it difficult to\ndesign a lightweight encoder without sacrificing performance. To address these\nchallenges, we propose a novel multi-scale feature compression method that\nenables both the end-to-end optimization on the extracted features and the\ndesign of lightweight encoders. The proposed model combines a learnable\ncompressor with a multi-scale feature fusion network so that the redundancy in\nthe multi-scale features is effectively removed. Instead of simply cascading\nthe fusion network and the compression network, we integrate the fusion and\nencoding processes in an interleaved way. Our model first encodes a\nlarger-scale feature to obtain a latent representation and then fuses the\nlatent with a smaller-scale feature. 
This process is successively performed\nuntil the smallest-scale feature is fused and then the encoded latent at the\nfinal stage is entropy-coded for transmission. The results show that our model\noutperforms previous approaches by at least 52% BD-rate reduction and has\n$\\times5$ to $\\times27$ times less encoding time for object detection...\n","authors":["Yeongwoong Kim","Hyewon Jeong","Janghyun Yu","Younhee Kim","Jooyoung Lee","Se Yoon Jeong","Hui Yong Kim"],"pdf_url":"https://arxiv.org/pdf/2306.16670v3.pdf","comment":"13 pages, accepted by IEEE Transactions on Circuits and Systems for\n Video Technology"},{"id":"http://arxiv.org/abs/2308.04047v1","updated":"2023-08-08T04:53:52Z","published":"2023-08-08T04:53:52Z","title":"SODFormer: Streaming Object Detection with Transformer Using Events and\n Frames","summary":" DAVIS camera, streaming two complementary sensing modalities of asynchronous\nevents and frames, has gradually been used to address major object detection\nchallenges (e.g., fast motion blur and low-light). However, how to effectively\nleverage rich temporal cues and fuse two heterogeneous visual streams remains a\nchallenging endeavor. To address this challenge, we propose a novel streaming\nobject detector with Transformer, namely SODFormer, which first integrates\nevents and frames to continuously detect objects in an asynchronous manner.\nTechnically, we first build a large-scale multimodal neuromorphic object\ndetection dataset (i.e., PKU-DAVIS-SOD) over 1080.1k manual labels. Then, we\ndesign a spatiotemporal Transformer architecture to detect objects via an\nend-to-end sequence prediction problem, where the novel temporal Transformer\nmodule leverages rich temporal cues from two visual streams to improve the\ndetection performance. Finally, an asynchronous attention-based fusion module\nis proposed to integrate two heterogeneous sensing modalities and take\ncomplementary advantages from each end, which can be queried at any time to\nlocate objects and break through the limited output frequency from synchronized\nframe-based fusion strategies. The results show that the proposed SODFormer\noutperforms four state-of-the-art methods and our eight baselines by a\nsignificant margin. We also show that our unifying framework works well even in\ncases where the conventional frame-based camera fails, e.g., high-speed motion\nand low-light conditions. Our dataset and code can be available at\nhttps://github.com/dianzl/SODFormer.\n","authors":["Dianze Li","Jianing Li","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.04047v1.pdf","comment":"18 pages, 15 figures, in IEEE Transactions on Pattern Analysis and\n Machine Intelligence"},{"id":"http://arxiv.org/abs/2308.04039v1","updated":"2023-08-08T04:30:42Z","published":"2023-08-08T04:30:42Z","title":"Implicit neural representations for joint decomposition and registration\n of gene expression images in the marmoset brain","summary":" We propose a novel image registration method based on implicit neural\nrepresentations that addresses the challenging problem of registering a pair of\nbrain images with similar anatomical structures, but where one image contains\nadditional features or artifacts that are not present in the other image. To\ndemonstrate its effectiveness, we use 2D microscopy $\\textit{in situ}$\nhybridization gene expression images of the marmoset brain. 
Accurately\nquantifying gene expression requires image registration to a brain template,\nwhich is difficult due to the diversity of patterns causing variations in\nvisible anatomical brain structures. Our approach uses implicit networks in\ncombination with an image exclusion loss to jointly perform the registration\nand decompose the image into a support and residual image. The support image\naligns well with the template, while the residual image captures individual\nimage characteristics that diverge from the template. In experiments, our\nmethod provided excellent results and outperformed other registration\ntechniques.\n","authors":["Michal Byra","Charissa Poon","Tomomi Shimogori","Henrik Skibbe"],"pdf_url":"https://arxiv.org/pdf/2308.04039v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2201.01615v3","updated":"2023-08-08T04:17:55Z","published":"2022-01-05T13:51:20Z","title":"Lawin Transformer: Improving New-Era Vision Backbones with Multi-Scale\n Representations for Semantic Segmentation","summary":" The multi-level aggregation (MLA) module has emerged as a critical component\nfor advancing new-era vision back-bones in semantic segmentation. In this\npaper, we propose Lawin (large window) Transformer, a novel MLA architecture\nthat creatively utilizes multi-scale feature maps from the vision backbone. At\nthe core of Lawin Transformer is the Lawin attention, a newly designed window\nattention mechanism capable of querying much larger context windows than local\nwindows. We focus on studying the efficient and simplistic application of the\nlarge-window paradigm, allowing for flexible regulation of the ratio of large\ncontext to query and capturing multi-scale representations. We validate the\neffectiveness of Lawin Transformer on Cityscapes and ADE20K, consistently\ndemonstrating great superiority to widely-used MLA modules when combined with\nnew-era vision backbones. The code is available at\nhttps://github.com/yan-hao-tian/lawin.\n","authors":["Haotian Yan","Chuang Zhang","Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2201.01615v3.pdf","comment":"The latest version has really big differences from the original\n version, which may make the reader confused. We will submit the latest\n version as another article"},{"id":"http://arxiv.org/abs/2308.03698v2","updated":"2023-08-08T03:40:53Z","published":"2023-08-07T16:14:27Z","title":"Screen-based 3D Subjective Experiment Software","summary":" Recently, widespread 3D graphics (e.g., point clouds and meshes) have drawn\nconsiderable efforts from academia and industry to assess their perceptual\nquality by conducting subjective experiments. However, lacking a handy software\nfor 3D subjective experiments complicates the construction of 3D graphics\nquality assessment datasets, thus hindering the prosperity of relevant fields.\nIn this paper, we develop a powerful platform with which users can flexibly\ndesign their 3D subjective methodologies and build high-quality datasets,\neasing a broad spectrum of 3D graphics subjective quality study. To accurately\nillustrate the perceptual quality differences of 3D stimuli, our software can\nsimultaneously render the source stimulus and impaired stimulus and allows both\nstimuli to respond synchronously to viewer interactions. Compared with amateur\n3D visualization tool-based or image/video rendering-based schemes, our\napproach embodies typical 3D applications while minimizing cognitive overload\nduring subjective experiments. 
We organized a subjective experiment involving\n40 participants to verify the validity of the proposed software. Experimental\nanalyses demonstrate that subjective tests on our software can produce\nreasonable subjective quality scores of 3D models. All resources in this paper\ncan be found at https://openi.pcl.ac.cn/OpenDatasets/3DQA.\n","authors":["Songlin Fan","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2308.03698v2.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.04020v1","updated":"2023-08-08T03:34:04Z","published":"2023-08-08T03:34:04Z","title":"Synthetic Augmentation with Large-scale Unconditional Pre-training","summary":" Deep learning based medical image recognition systems often require a\nsubstantial amount of training data with expert annotations, which can be\nexpensive and time-consuming to obtain. Recently, synthetic augmentation\ntechniques have been proposed to mitigate the issue by generating realistic\nimages conditioned on class labels. However, the effectiveness of these methods\nheavily depends on the representation capability of the trained generative\nmodel, which cannot be guaranteed without sufficient labeled training data. To\nfurther reduce the dependency on annotated data, we propose a synthetic\naugmentation method called HistoDiffusion, which can be pre-trained on\nlarge-scale unlabeled datasets and later applied to a small-scale labeled\ndataset for augmented training. In particular, we train a latent diffusion\nmodel (LDM) on diverse unlabeled datasets to learn common features and generate\nrealistic images without conditional inputs. Then, we fine-tune the model with\nclassifier guidance in latent space on an unseen labeled dataset so that the\nmodel can synthesize images of specific categories. Additionally, we adopt a\nselective mechanism to only add synthetic samples with high confidence of\nmatching to target labels. We evaluate our proposed method by pre-training on\nthree histopathology datasets and testing on a histopathology dataset of\ncolorectal cancer (CRC) excluded from the pre-training datasets. With\nHistoDiffusion augmentation, the classification accuracy of a backbone\nclassifier is remarkably improved by 6.4% using a small set of the original\nlabels. Our code is available at https://github.com/karenyyy/HistoDiffAug.\n","authors":["Jiarong Ye","Haomiao Ni","Peng Jin","Sharon X. Huang","Yuan Xue"],"pdf_url":"https://arxiv.org/pdf/2308.04020v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.04016v1","updated":"2023-08-08T03:24:21Z","published":"2023-08-08T03:24:21Z","title":"Hierarchical Visual Primitive Experts for Compositional Zero-Shot\n Learning","summary":" Compositional zero-shot learning (CZSL) aims to recognize unseen compositions\nwith prior knowledge of known primitives (attribute and object). Previous works\nfor CZSL often suffer from grasping the contextuality between attribute and\nobject, as well as the discriminability of visual features, and the long-tailed\ndistribution of real-world compositional data. We propose a simple and scalable\nframework called Composition Transformer (CoT) to address these issues. CoT\nemploys object and attribute experts in distinctive manners to generate\nrepresentative embeddings, using the visual network hierarchically. 
The object\nexpert extracts representative object embeddings from the final layer in a\nbottom-up manner, while the attribute expert makes attribute embeddings in a\ntop-down manner with a proposed object-guided attention module that models\ncontextuality explicitly. To remedy biased prediction caused by imbalanced data\ndistribution, we develop a simple minority attribute augmentation (MAA) that\nsynthesizes virtual samples by mixing two images and oversampling minority\nattribute classes. Our method achieves SoTA performance on several benchmarks,\nincluding MIT-States, C-GQA, and VAW-CZSL. We also demonstrate the\neffectiveness of CoT in improving visual discrimination and addressing the\nmodel bias from the imbalanced data distribution. The code is available at\nhttps://github.com/HanjaeKim98/CoT.\n","authors":["Hanjae Kim","Jiyoung Lee","Seongheon Park","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2308.04016v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.04008v1","updated":"2023-08-08T03:06:10Z","published":"2023-08-08T03:06:10Z","title":"Coarse-to-Fine: Learning Compact Discriminative Representation for\n Single-Stage Image Retrieval","summary":" Image retrieval targets to find images from a database that are visually\nsimilar to the query image. Two-stage methods following retrieve-and-rerank\nparadigm have achieved excellent performance, but their separate local and\nglobal modules are inefficient to real-world applications. To better trade-off\nretrieval efficiency and accuracy, some approaches fuse global and local\nfeature into a joint representation to perform single-stage image retrieval.\nHowever, they are still challenging due to various situations to tackle,\n$e.g.$, background, occlusion and viewpoint. In this work, we design a\nCoarse-to-Fine framework to learn Compact Discriminative representation (CFCD)\nfor end-to-end single-stage image retrieval-requiring only image-level labels.\nSpecifically, we first design a novel adaptive softmax-based loss which\ndynamically tunes its scale and margin within each mini-batch and increases\nthem progressively to strengthen supervision during training and intra-class\ncompactness. Furthermore, we propose a mechanism which attentively selects\nprominent local descriptors and infuse fine-grained semantic relations into the\nglobal representation by a hard negative sampling strategy to optimize\ninter-class distinctiveness at a global scale. Extensive experimental results\nhave demonstrated the effectiveness of our method, which achieves\nstate-of-the-art single-stage image retrieval performance on benchmarks such as\nRevisited Oxford and Revisited Paris. Code is available at\nhttps://github.com/bassyess/CFCD.\n","authors":["Yunquan Zhu","Xinkai Gao","Bo Ke","Ruizhi Qiao","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2308.04008v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.04005v1","updated":"2023-08-08T02:48:46Z","published":"2023-08-08T02:48:46Z","title":"Few-shot medical image classification with simple shape and texture text\n descriptors using vision-language models","summary":" In this work, we investigate the usefulness of vision-language models (VLMs)\nand large language models for binary few-shot classification of medical images.\nWe utilize the GPT-4 model to generate text descriptors that encapsulate the\nshape and texture characteristics of objects in medical images. 
Subsequently,\nthese GPT-4 generated descriptors, alongside VLMs pre-trained on natural\nimages, are employed to classify chest X-rays and breast ultrasound images. Our\nresults indicate that few-shot classification of medical images using VLMs and\nGPT-4 generated descriptors is a viable approach. However, accurate\nclassification requires excluding certain descriptors from the calculation of\nthe classification scores. Moreover, we assess the ability of VLMs to evaluate\nshape features in breast mass ultrasound images. We further investigate the\ndegree of variability among the sets of text descriptors produced by GPT-4. Our\nwork provides several important insights about the application of VLMs for\nmedical image analysis.\n","authors":["Michal Byra","Muhammad Febrian Rachmadi","Henrik Skibbe"],"pdf_url":"https://arxiv.org/pdf/2308.04005v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.10044v3","updated":"2023-08-08T02:40:05Z","published":"2023-05-17T08:37:26Z","title":"Two-Stream Regression Network for Dental Implant Position Prediction","summary":" In implant prosthesis treatment, the design of the surgical guide heavily\nrelies on the manual location of the implant position, which is subjective and\ndepends on the doctor's experience. Although deep learning based methods have started to\nbe applied to address this problem, the spacing between teeth varies, and\nsome gaps may present texture characteristics similar to those of the actual\nimplant region. Both problems pose a major challenge for implant position\nprediction. In this paper, we develop a two-stream implant position regression\nframework (TSIPR), which consists of an implant region detector (IRD) and a\nmulti-scale patch embedding regression network (MSPENet), to address this\nissue. For the training of IRD, we extend the original annotation to provide\nadditional supervisory information, which contains much richer\ncharacteristics and does not introduce extra labeling costs. A multi-scale patch\nembedding module is designed for the MSPENet to adaptively extract features\nfrom the images with various tooth spacing. The global-local feature\ninteraction block is designed to build the encoder of MSPENet, which combines\nthe transformer and convolution for enriched feature representation. During\ninference, the RoI mask extracted from the IRD is used to refine the prediction\nresults of the MSPENet. Extensive experiments on a dental implant dataset\nthrough five-fold cross-validation demonstrated that the proposed TSIPR\nachieves superior performance compared to existing methods.\n","authors":["Xinquan Yang","Xuguang Li","Xuechen Li","Wenting Chen","Linlin Shen","Xin Li","Yongqiang Deng"],"pdf_url":"https://arxiv.org/pdf/2305.10044v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12676v3","updated":"2023-08-08T02:32:24Z","published":"2023-07-24T10:30:54Z","title":"Damage Vision Mining Opportunity for Imbalanced Anomaly Detection","summary":" In the past decade, balanced datasets have been used to advance\nalgorithms for classification, object detection, semantic segmentation, and\nanomaly detection in industrial applications. Specifically, for condition-based\nmaintenance, automating visual inspection is crucial to ensure high quality.\nDeterioration prognosis attempts to optimize the fine decision process for\npredictive maintenance and proactive repair. 
In civil infrastructure and living\nenvironments, damage data mining cannot avoid the imbalanced data issue because\nof rare unseen events and the high-quality status achieved by improved operations. For\nvisual inspection, deteriorated classes acquired from the surfaces of concrete and\nsteel components are occasionally imbalanced. From numerous related surveys, we\nsummarize that imbalanced data problems can be categorized into four types: 1)\nmissing range of target and label variables, 2) majority-minority class\nimbalance, 3) foreground-background spatial imbalance, and 4) long-tailed\npixel-wise class imbalance. Since 2015, there have been many imbalanced-data studies\nusing deep learning approaches, including regression, image classification,\nobject detection, and semantic segmentation. However, anomaly detection for\nimbalanced data is not yet well known. In this study, we highlight one-class\nanomaly detection applications that decide whether a sample belongs to the\nanomalous class or not, and demonstrate\nclear examples on imbalanced vision datasets: blood smear, lung infection,\nhazardous driving, wooden, concrete deterioration, river sludge, and disaster\ndamage. As illustrated in Fig.1, we provide key results on the advantage of damage vision\nmining, hypothesizing that the more effective the range of the positive ratio, the\nhigher the accuracy gain of the anomaly detection application. In our imbalanced\nstudies, compared with the balanced case of positive ratio 1/1, we find that\nthere is an applicable range of positive ratios where the accuracy is consistently high.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v3.pdf","comment":"21 pages, 29 figures, 18 tables"},{"id":"http://arxiv.org/abs/2308.02494v2","updated":"2023-08-08T02:32:04Z","published":"2023-07-16T19:36:19Z","title":"Adaptively Placed Multi-Grid Scene Representation Networks for\n Large-Scale Data Visualization","summary":" Scene representation networks (SRNs) have been recently proposed for\ncompression and visualization of scientific data. However, state-of-the-art\nSRNs do not adapt the allocation of available network parameters to the complex\nfeatures found in scientific data, leading to a loss in reconstruction quality.\nWe address this shortcoming with an adaptively placed multi-grid SRN (APMGSRN)\nand propose a domain decomposition training and inference technique for\naccelerated parallel training on multi-GPU systems. We also release an\nopen-source neural volume rendering application that allows plug-and-play\nrendering with any PyTorch-based SRN. Our proposed APMGSRN architecture uses\nmultiple spatially adaptive feature grids that learn where to be placed within\nthe domain to dynamically allocate more neural network resources where error is\nhigh in the volume, improving state-of-the-art reconstruction accuracy of SRNs\nfor scientific data without requiring expensive octree refining, pruning, and\ntraversal like previous adaptive models. In our domain decomposition approach\nfor representing large-scale data, we train a set of APMGSRNs in parallel on\nseparate bricks of the volume to reduce training time while avoiding the overhead\nnecessary for an out-of-core solution for volumes too large to fit in GPU\nmemory. After training, the lightweight SRNs are used for realtime neural\nvolume rendering in our open-source renderer, where arbitrary view angles and\ntransfer functions can be explored. 
A copy of this paper, all code, all models\nused in our experiments, and all supplemental materials and videos are\navailable at https://github.com/skywolf829/APMGSRN.\n","authors":["Skylar Wolfgang Wurster","Tianyu Xiong","Han-Wei Shen","Hanqi Guo","Tom Peterka"],"pdf_url":"https://arxiv.org/pdf/2308.02494v2.pdf","comment":"Accepted to IEEE VIS 2023"},{"id":"http://arxiv.org/abs/2308.03999v1","updated":"2023-08-08T02:28:50Z","published":"2023-08-08T02:28:50Z","title":"Understanding CNN Hidden Neuron Activations using Structured Background\n Knowledge and Deductive Reasoning","summary":" A major challenge in Explainable AI is in correctly interpreting activations\nof hidden neurons: accurate interpretations would provide insights into the\nquestion of what a deep learning system has internally detected as relevant on\nthe input, de-mystifying the otherwise black-box character of deep learning\nsystems. The state of the art indicates that hidden node activations can, in\nsome cases, be interpretable in a way that makes sense to humans, but\nsystematic automated methods that would be able to hypothesize and verify\ninterpretations of hidden neuron activations are underexplored. In this paper,\nwe provide such a method and demonstrate that it provides meaningful\ninterpretations. Our approach is based on using large-scale background\nknowledge approximately 2 million classes curated from the Wikipedia concept\nhierarchy together with a symbolic reasoning approach called Concept Induction\nbased on description logics, originally developed for applications in the\nSemantic Web field. Our results show that we can automatically attach\nmeaningful labels from the background knowledge to individual neurons in the\ndense layer of a Convolutional Neural Network through a hypothesis and\nverification process\n","authors":["Abhilekha Dalal","Md Kamruzzaman Sarker","Adrita Barua","Eugene Vasserman","Pascal Hitzler"],"pdf_url":"https://arxiv.org/pdf/2308.03999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03998v1","updated":"2023-08-08T02:28:48Z","published":"2023-08-08T02:28:48Z","title":"Real-time Strawberry Detection Based on Improved YOLOv5s Architecture\n for Robotic Harvesting in open-field environment","summary":" This study proposed a YOLOv5-based custom object detection model to detect\nstrawberries in an outdoor environment. The original architecture of the\nYOLOv5s was modified by replacing the C3 module with the C2f module in the\nbackbone network, which provided a better feature gradient flow. Secondly, the\nSpatial Pyramid Pooling Fast in the final layer of the backbone network of\nYOLOv5s was combined with Cross Stage Partial Net to improve the generalization\nability over the strawberry dataset in this study. The proposed architecture\nwas named YOLOv5s-Straw. The RGB images dataset of the strawberry canopy with\nthree maturity classes (immature, nearly mature, and mature) was collected in\nopen-field environment and augmented through a series of operations including\nbrightness reduction, brightness increase, and noise adding. To verify the\nsuperiority of the proposed method for strawberry detection in open-field\nenvironment, four competitive detection models (YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s) were trained, and tested under the same computational\nenvironment and compared with YOLOv5s-Straw. 
The results showed that the\nhighest mean average precision of 80.3% was achieved using the proposed\narchitecture, whereas YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s achieved 73.4%, 77.8%, 79.8%, and 79.3%, respectively.\nSpecifically, the average precision of YOLOv5s-Straw was 82.1% in the immature\nclass, 73.5% in the nearly mature class, and 86.6% in the mature class, which\nwere 2.3% and 3.7% higher, respectively, than those of the latest YOLOv8s. The\nmodel included 8.6*10^6 network parameters with an inference speed of 18ms per\nimage, while YOLOv8s had a slower inference speed of\n21.0ms and more parameters (11.1*10^6), which indicates that the proposed\nmodel is fast enough for real-time strawberry detection and localization for\nrobotic picking.\n","authors":["Zixuan He","Salik Ram Khana","Xin Zhang","Manoj Karkee","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03998v1.pdf","comment":"20 pages; 15 figures"},{"id":"http://arxiv.org/abs/2307.02227v2","updated":"2023-08-08T02:19:48Z","published":"2023-07-05T12:08:56Z","title":"MAE-DFER: Efficient Masked Autoencoder for Self-supervised Dynamic\n Facial Expression Recognition","summary":" Dynamic facial expression recognition (DFER) is essential to the development\nof intelligent and empathetic machines. Prior efforts in this field mainly fall\ninto the supervised learning paradigm, which is severely restricted by the limited\nlabeled data in existing datasets. Inspired by the recent unprecedented success of\nmasked autoencoders (e.g., VideoMAE), this paper proposes MAE-DFER, a novel\nself-supervised method which leverages large-scale self-supervised pre-training\non abundant unlabeled data to largely advance the development of DFER. Since\nthe vanilla Vision Transformer (ViT) employed in VideoMAE requires substantial\ncomputation during fine-tuning, MAE-DFER develops an efficient local-global\ninteraction Transformer (LGI-Former) as the encoder. Moreover, in addition to\nthe standalone appearance content reconstruction in VideoMAE, MAE-DFER also\nintroduces explicit temporal facial motion modeling to encourage LGI-Former to\nexcavate both static appearance and dynamic motion information. Extensive\nexperiments on six datasets show that MAE-DFER consistently outperforms\nstate-of-the-art supervised methods by significant margins (e.g., +6.30\\% UAR\non DFEW and +8.34\\% UAR on MAFW), verifying that it can learn powerful dynamic\nfacial representations via large-scale self-supervised pre-training. Besides,\nit has comparable or even better performance than VideoMAE, while largely\nreducing the computational cost (about 38\\% FLOPs). We believe MAE-DFER has\npaved a new way for the advancement of DFER and can inspire more relevant\nresearch in this field and even other related tasks. Codes and models are\npublicly available at https://github.com/sunlicai/MAE-DFER.\n","authors":["Licai Sun","Zheng Lian","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2307.02227v2.pdf","comment":"ACM MM 2023 (camera ready). Codes and models are publicly available\n at https://github.com/sunlicai/MAE-DFER"},{"id":"http://arxiv.org/abs/2308.03982v1","updated":"2023-08-08T01:59:20Z","published":"2023-08-08T01:59:20Z","title":"PARTNER: Level up the Polar Representation for LiDAR 3D Object Detection","summary":" Recently, polar-based representation has shown promising properties in\nperceptual tasks. 
In addition to Cartesian-based approaches, which separate\npoint clouds unevenly, representing point clouds as polar grids has been\nrecognized as an alternative due to (1) its advantage in robust performance\nunder different resolutions and (2) its superiority in streaming-based\napproaches. However, state-of-the-art polar-based detection methods inevitably\nsuffer from the feature distortion problem because of the non-uniform division\nof polar representation, resulting in a non-negligible performance gap compared\nto Cartesian-based approaches. To tackle this issue, we present PARTNER, a\nnovel 3D object detector in the polar coordinate. PARTNER alleviates the\ndilemma of feature distortion with global representation re-alignment and\nfacilitates the regression by introducing instance-level geometric information\ninto the detection head. Extensive experiments show overwhelming advantages in\nstreaming-based detection and different resolutions. Furthermore, our method\noutperforms the previous polar-based works with remarkable margins of 3.68% and\n9.15% on Waymo and ONCE validation set, thus achieving competitive results over\nthe state-of-the-art methods.\n","authors":["Ming Nie","Yujing Xue","Chunwei Wang","Chaoqiang Ye","Hang Xu","Xinge Zhu","Qingqiu Huang","Michael Bi Mi","Xinchao Wang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03982v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03979v1","updated":"2023-08-08T01:55:44Z","published":"2023-08-08T01:55:44Z","title":"PAIF: Perception-Aware Infrared-Visible Image Fusion for Attack-Tolerant\n Semantic Segmentation","summary":" Infrared and visible image fusion is a powerful technique that combines\ncomplementary information from different modalities for downstream semantic\nperception tasks. Existing learning-based methods show remarkable performance,\nbut are suffering from the inherent vulnerability of adversarial attacks,\ncausing a significant decrease in accuracy. In this work, a perception-aware\nfusion framework is proposed to promote segmentation robustness in adversarial\nscenes. We first conduct systematic analyses about the components of image\nfusion, investigating the correlation with segmentation robustness under\nadversarial perturbations. Based on these analyses, we propose a harmonized\narchitecture search with a decomposition-based structure to balance standard\naccuracy and robustness. We also propose an adaptive learning strategy to\nimprove the parameter robustness of image fusion, which can learn effective\nfeature extraction under diverse adversarial perturbations. Thus, the goals of\nimage fusion (\\textit{i.e.,} extracting complementary features from source\nmodalities and defending attack) can be realized from the perspectives of\narchitectural and learning strategies. Extensive experimental results\ndemonstrate that our scheme substantially enhances the robustness, with gains\nof 15.3% mIOU of segmentation in the adversarial scene, compared with advanced\ncompetitors. 
The source codes are available at\nhttps://github.com/LiuZhu-CV/PAIF.\n","authors":["Zhu Liu","Jinyuan Liu","Benzhuang Zhang","Long Ma","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03979v1.pdf","comment":"Accepted by ACM MM'2023;The source codes are available at\n https://github.com/LiuZhu-CV/PAIF"},{"id":"http://arxiv.org/abs/2308.03276v2","updated":"2023-08-08T01:55:32Z","published":"2023-08-07T03:35:47Z","title":"Spatialyze: A Geospatial Video Analytics System with Spatial-Aware\n Optimizations","summary":" Videos that are shot using commodity hardware such as phones and surveillance\ncameras record various metadata such as time and location. We encounter such\ngeospatial videos on a daily basis and such videos have been growing in volume\nsignificantly. Yet, we do not have data management systems that allow users to\ninteract with such data effectively.\n In this paper, we describe Spatialyze, a new framework for end-to-end\nquerying of geospatial videos. Spatialyze comes with a domain-specific language\nwhere users can construct geospatial video analytic workflows using a 3-step,\ndeclarative, build-filter-observe paradigm. Internally, Spatialyze leverages\nthe declarative nature of such workflows, the temporal-spatial metadata stored\nwith videos, and physical behavior of real-world objects to optimize the\nexecution of workflows. Our results using real-world videos and workflows show\nthat Spatialyze can reduce execution time by up to 5.3x, while maintaining up\nto 97.1% accuracy compared to unoptimized execution.\n","authors":["Chanwut Kittivorawong","Yongming Ge","Yousef Helal","Alvin Cheung"],"pdf_url":"https://arxiv.org/pdf/2308.03276v2.pdf","comment":"GitHub Repository: https://github.com/apperception-db/spatialyze"},{"id":"http://arxiv.org/abs/2301.01635v3","updated":"2023-08-08T01:45:37Z","published":"2023-01-04T14:20:14Z","title":"SPTS v2: Single-Point Scene Text Spotting","summary":" End-to-end scene text spotting has made significant progress due to its\nintrinsic synergy between text detection and recognition. Previous methods\ncommonly regard manual annotations such as horizontal rectangles, rotated\nrectangles, quadrangles, and polygons as a prerequisite, which are much more\nexpensive than using single-point. Our new framework, SPTS v2, allows us to\ntrain high-performing text-spotting models using a single-point annotation.\nSPTS v2 reserves the advantage of the auto-regressive Transformer with an\nInstance Assignment Decoder (IAD) through sequentially predicting the center\npoints of all text instances inside the same predicting sequence, while with a\nParallel Recognition Decoder (PRD) for text recognition in parallel. These two\ndecoders share the same parameters and are interactively connected with a\nsimple but effective information transmission process to pass the gradient and\ninformation. Comprehensive experiments on various existing benchmark datasets\ndemonstrate the SPTS v2 can outperform previous state-of-the-art single-point\ntext spotters with fewer parameters while achieving 19$\\times$ faster inference\nspeed. Within the context of our SPTS v2 framework, our experiments suggest a\npotential preference for single-point representation in scene text spotting\nwhen compared to other representations. Such an attempt provides a significant\nopportunity for scene text spotting applications beyond the realms of existing\nparadigms. 
Code is available at https://github.com/Yuliang-Liu/SPTSv2.\n","authors":["Yuliang Liu","Jiaxin Zhang","Dezhi Peng","Mingxin Huang","Xinyu Wang","Jingqun Tang","Can Huang","Dahua Lin","Chunhua Shen","Xiang Bai","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2301.01635v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2112.07917"},{"id":"http://arxiv.org/abs/2307.12450v2","updated":"2023-08-08T01:42:17Z","published":"2023-07-23T22:48:07Z","title":"ProtoFL: Unsupervised Federated Learning via Prototypical Distillation","summary":" Federated learning (FL) is a promising approach for enhancing data privacy\npreservation, particularly for authentication systems. However, limited round\ncommunications, scarce representation, and scalability pose significant\nchallenges to its deployment, hindering its full potential. In this paper, we\npropose 'ProtoFL', Prototypical Representation Distillation based unsupervised\nFederated Learning to enhance the representation power of a global model and\nreduce round communication costs. Additionally, we introduce a local one-class\nclassifier based on normalizing flows to improve performance with limited data.\nOur study represents the first investigation of using FL to improve one-class\nclassification performance. We conduct extensive experiments on five widely\nused benchmarks, namely MNIST, CIFAR-10, CIFAR-100, ImageNet-30, and\nKeystroke-Dynamics, to demonstrate the superior performance of our proposed\nframework over previous methods in the literature.\n","authors":["Hansol Kim","Youngjun Kwak","Minyoung Jung","Jinho Shin","Youngsung Kim","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12450v2.pdf","comment":"Accepted by ICCV 2023. Hansol Kim and Youngjun Kwak contributed\n equally to this work"},{"id":"http://arxiv.org/abs/2308.03286v2","updated":"2023-08-08T01:34:30Z","published":"2023-08-07T04:04:22Z","title":"Multi-Label Self-Supervised Learning with Scene Images","summary":" Self-supervised learning (SSL) methods targeting scene images have seen a\nrapid growth recently, and they mostly rely on either a dedicated dense\nmatching mechanism or a costly unsupervised object discovery module. This paper\nshows that instead of hinging on these strenuous operations, quality image\nrepresentations can be learned by treating scene/multi-label image SSL simply\nas a multi-label classification problem, which greatly simplifies the learning\nframework. Specifically, multiple binary pseudo-labels are assigned for each\ninput image by comparing its embeddings with those in two dictionaries, and the\nnetwork is optimized using the binary cross entropy loss. The proposed method\nis named Multi-Label Self-supervised learning (MLS). Visualizations\nqualitatively show that clearly the pseudo-labels by MLS can automatically find\nsemantically similar pseudo-positive pairs across different images to\nfacilitate contrastive learning. MLS learns high quality representations on\nMS-COCO and achieves state-of-the-art results on classification, detection and\nsegmentation benchmarks. 
At the same time, MLS is much simpler than existing\nmethods, making it easier to deploy and for further exploration.\n","authors":["Ke Zhu","Minghao Fu","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03286v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.03977v1","updated":"2023-08-08T01:33:13Z","published":"2023-08-08T01:33:13Z","title":"PUG: Photorealistic and Semantically Controllable Synthetic Data for\n Representation Learning","summary":" Synthetic image datasets offer unmatched advantages for designing and\nevaluating deep neural networks: they make it possible to (i) render as many\ndata samples as needed, (ii) precisely control each scene and yield granular\nground truth labels (and captions), (iii) precisely control distribution shifts\nbetween training and testing to isolate variables of interest for sound\nexperimentation. Despite such promise, the use of synthetic image data is still\nlimited -- and often played down -- mainly due to their lack of realism. Most\nworks therefore rely on datasets of real images, which have often been scraped\nfrom public images on the internet, and may have issues with regards to\nprivacy, bias, and copyright, while offering little control over how objects\nprecisely appear. In this work, we present a path to democratize the use of\nphotorealistic synthetic data: we develop a new generation of interactive\nenvironments for representation learning research, that offer both\ncontrollability and realism. We use the Unreal Engine, a powerful game engine\nwell known in the entertainment industry, to produce PUG (Photorealistic Unreal\nGraphics) environments and datasets for representation learning. In this paper,\nwe demonstrate the potential of PUG to enable more rigorous evaluations of\nvision models.\n","authors":["Florian Bordes","Shashank Shekhar","Mark Ibrahim","Diane Bouchacourt","Pascal Vincent","Ari S. Morcos"],"pdf_url":"https://arxiv.org/pdf/2308.03977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02552v2","updated":"2023-08-08T01:30:26Z","published":"2023-08-02T03:34:44Z","title":"Degeneration-Tuning: Using Scrambled Grid shield Unwanted Concepts from\n Stable Diffusion","summary":" Owing to the unrestricted nature of the content in the training data, large\ntext-to-image diffusion models, such as Stable Diffusion (SD), are capable of\ngenerating images with potentially copyrighted or dangerous content based on\ncorresponding textual concepts information. This includes specific intellectual\nproperty (IP), human faces, and various artistic styles. However, Negative\nPrompt, a widely used method for content removal, frequently fails to conceal\nthis content due to inherent limitations in its inference logic. In this work,\nwe propose a novel strategy named \\textbf{Degeneration-Tuning (DT)} to shield\ncontents of unwanted concepts from SD weights. By utilizing Scrambled Grid to\nreconstruct the correlation between undesired concepts and their corresponding\nimage domain, we guide SD to generate meaningless content when such textual\nconcepts are provided as input. As this adaptation occurs at the level of the\nmodel's weights, the SD, after DT, can be grafted onto other conditional\ndiffusion frameworks like ControlNet to shield unwanted concepts. 
In addition\nto qualitatively showcasing the effectiveness of our DT method in protecting\nvarious types of concepts, a quantitative comparison of the SD before and after\nDT indicates that the DT method does not significantly impact the generative\nquality of other contents. The FID and IS scores of the model on COCO-30K\nexhibit only minor changes after DT, shifting from 12.61 and 39.20 to 13.04 and\n38.25, respectively, which clearly outperforms the previous methods.\n","authors":["Zixuan Ni","Longhui Wei","Jiacheng Li","Siliang Tang","Yueting Zhuang","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2308.02552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03975v1","updated":"2023-08-08T01:27:55Z","published":"2023-08-08T01:27:55Z","title":"Prompted Contrast with Masked Motion Modeling: Towards Versatile 3D\n Action Representation Learning","summary":" Self-supervised learning has proved effective for skeleton-based human action\nunderstanding, which is an important yet challenging topic. Previous works\nmainly rely on contrastive learning or masked motion modeling paradigm to model\nthe skeleton relations. However, the sequence-level and joint-level\nrepresentation learning cannot be effectively and simultaneously handled by\nthese methods. As a result, the learned representations fail to generalize to\ndifferent downstream tasks. Moreover, combining these two paradigms in a naive\nmanner leaves the synergy between them untapped and can lead to interference in\ntraining. To address these problems, we propose Prompted Contrast with Masked\nMotion Modeling, PCM$^{\\rm 3}$, for versatile 3D action representation\nlearning. Our method integrates the contrastive learning and masked prediction\ntasks in a mutually beneficial manner, which substantially boosts the\ngeneralization capacity for various downstream tasks. Specifically, masked\nprediction provides novel training views for contrastive learning, which in\nturn guides the masked prediction training with high-level semantic\ninformation. Moreover, we propose a dual-prompted multi-task pretraining\nstrategy, which further improves model representations by reducing the\ninterference caused by learning the two different pretext tasks. Extensive\nexperiments on five downstream tasks under three large-scale datasets are\nconducted, demonstrating the superior generalization capacity of PCM$^{\\rm 3}$\ncompared to the state-of-the-art works. Our project is publicly available at:\nhttps://jhang2020.github.io/Projects/PCM3/PCM3.html .\n","authors":["Jiahang Zhang","Lilang Lin","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03975v1.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.03968v1","updated":"2023-08-08T00:46:01Z","published":"2023-08-08T00:46:01Z","title":"CheXFusion: Effective Fusion of Multi-View Features using Transformers\n for Long-Tailed Chest X-Ray Classification","summary":" Medical image classification poses unique challenges due to the long-tailed\ndistribution of diseases, the co-occurrence of diagnostic findings, and the\nmultiple views available for each study or patient. This paper introduces our\nsolution to the ICCV CVAMD 2023 Shared Task on CXR-LT: Multi-Label Long-Tailed\nClassification on Chest X-Rays. Our approach introduces CheXFusion, a\ntransformer-based fusion module incorporating multi-view images. 
The fusion\nmodule, guided by self-attention and cross-attention mechanisms, efficiently\naggregates multi-view features while considering label co-occurrence.\nFurthermore, we explore data balancing and self-training methods to optimize\nthe model's performance. Our solution achieves state-of-the-art results with\n0.372 mAP in the MIMIC-CXR test set, securing 1st place in the competition. Our\nsuccess in the task underscores the significance of considering multi-view\nsettings, class imbalance, and label co-occurrence in medical image\nclassification. Public code is available at\nhttps://github.com/dongkyuk/CXR-LT-public-solution\n","authors":["Dongkyun Kim"],"pdf_url":"https://arxiv.org/pdf/2308.03968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04622v1","updated":"2023-08-08T23:12:33Z","published":"2023-08-08T23:12:33Z","title":"Rendering Humans from Object-Occluded Monocular Videos","summary":" 3D understanding and rendering of moving humans from monocular videos is a\nchallenging task. Despite recent progress, the task remains difficult in\nreal-world scenarios, where obstacles may block the camera view and cause\npartial occlusions in the captured videos. Existing methods cannot handle such\ndefects due to two reasons. First, the standard rendering strategy relies on\npoint-point mapping, which could lead to dramatic disparities between the\nvisible and occluded areas of the body. Second, the naive direct regression\napproach does not consider any feasibility criteria (ie, prior information) for\nrendering under occlusions. To tackle the above drawbacks, we present OccNeRF,\na neural rendering method that achieves better rendering of humans in severely\noccluded scenes. As direct solutions to the two drawbacks, we propose\nsurface-based rendering by integrating geometry and visibility priors. We\nvalidate our method on both simulated and real-world occlusions and demonstrate\nour method's superiority.\n","authors":["Tiange Xiang","Adam Sun","Jiajun Wu","Ehsan Adeli","Li Fei-Fei"],"pdf_url":"https://arxiv.org/pdf/2308.04622v1.pdf","comment":"ICCV 2023, project page:\n https://cs.stanford.edu/~xtiange/projects/occnerf/"},{"id":"http://arxiv.org/abs/2308.04605v1","updated":"2023-08-08T22:10:29Z","published":"2023-08-08T22:10:29Z","title":"PSRFlow: Probabilistic Super Resolution with Flow-Based Models for\n Scientific Data","summary":" Although many deep-learning-based super-resolution approaches have been\nproposed in recent years, because no ground truth is available in the inference\nstage, few can quantify the errors and uncertainties of the super-resolved\nresults. For scientific visualization applications, however, conveying\nuncertainties of the results to scientists is crucial to avoid generating\nmisleading or incorrect information. In this paper, we propose PSRFlow, a novel\nnormalizing flow-based generative model for scientific data super-resolution\nthat incorporates uncertainty quantification into the super-resolution process.\nPSRFlow learns the conditional distribution of the high-resolution data based\non the low-resolution counterpart. By sampling from a Gaussian latent space\nthat captures the missing information in the high-resolution data, one can\ngenerate different plausible super-resolution outputs. The efficient sampling\nin the Gaussian latent space allows our model to perform uncertainty\nquantification for the super-resolved results. 
During model training, we\naugment the training data with samples across various scales to make the model\nadaptable to data of different scales, achieving flexible super-resolution for\na given input. Our results demonstrate superior performance and robust\nuncertainty quantification compared with existing methods such as interpolation\nand GAN-based super-resolution networks.\n","authors":["Jingyi Shen","Han-Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04605v1.pdf","comment":"To be published in Proc. IEEE VIS 2023"},{"id":"http://arxiv.org/abs/2308.04598v1","updated":"2023-08-08T21:52:07Z","published":"2023-08-08T21:52:07Z","title":"1st Place Solution for CVPR2023 BURST Long Tail and Open World\n Challenges","summary":" Currently, Video Instance Segmentation (VIS) aims at segmenting and\ncategorizing objects in videos from a closed set of training categories that\ncontain only a few dozen categories, lacking the ability to handle diverse\nobjects in real-world videos. With the release of the TAO and BURST datasets, we have the\nopportunity to research VIS in long-tailed and open-world scenarios.\nTraditional VIS methods are evaluated on benchmarks limited to a small number\nof common classes, but practical applications require trackers that go beyond\nthese common classes, detecting and tracking rare and even never-before-seen\nobjects. Inspired by the latest MOT paper for the long-tail task (Tracking\nEvery Thing in the Wild, Siyuan Li et al.), for the BURST long-tail challenge, we\ntrain our model on a combination of LVISv0.5 and the COCO dataset using repeat\nfactor sampling. First, we train the detector with segmentation and CEM on the\nLVISv0.5 + COCO dataset. Then, we train the instance appearance similarity\nhead on the TAO dataset. Finally, our method (LeTracker) gets 14.9 HOTAall on\nthe BURST test set, ranking 1st in the benchmark. For the open-world\nchallenge, we train using only the annotations of 64 classes (the intersection of the classes in the BURST\ntrain subset and the COCO dataset, without the LVIS dataset), and, testing on the\nBURST test set, we get 61.4 OWTAall, ranking 1st in the benchmark. Our\ncode will be released to facilitate future research.\n","authors":["Kaer Huang"],"pdf_url":"https://arxiv.org/pdf/2308.04598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04589v1","updated":"2023-08-08T21:18:23Z","published":"2023-08-08T21:18:23Z","title":"Temporal DINO: A Self-supervised Video Strategy to Enhance Action\n Prediction","summary":" The emerging field of action prediction plays a vital role in various\ncomputer vision applications such as autonomous driving, activity analysis and\nhuman-computer interaction. Despite significant advancements, accurately\npredicting future actions remains a challenging problem due to high\ndimensionality, complex dynamics and uncertainties inherent in video data.\nTraditional supervised approaches require large amounts of labelled data, which\nis expensive and time-consuming to obtain. This paper introduces a novel\nself-supervised video strategy for enhancing action prediction inspired by DINO\n(self-distillation with no labels). The Temporal-DINO approach employs two\nmodels: a 'student' processing past frames, and a 'teacher' processing both\npast and future frames, enabling a broader temporal context. During training,\nthe teacher guides the student to learn future context by only observing past\nframes. The strategy is evaluated on the ROAD dataset for the action prediction\ndownstream task using 3D-ResNet, Transformer, and LSTM architectures. 
The\nexperimental results showcase significant improvements in prediction\nperformance across these architectures, with our method achieving an average\nenhancement of 9.9% Precision Points (PP), highlighting its effectiveness in\nenhancing the backbones' capabilities of capturing long-term dependencies.\nFurthermore, our approach demonstrates efficiency regarding the pretraining\ndataset size and the number of epochs required. This method overcomes\nlimitations present in other approaches, including considering various backbone\narchitectures, addressing multiple prediction horizons, reducing reliance on\nhand-crafted augmentations, and streamlining the pretraining process into a\nsingle stage. These findings highlight the potential of our approach in diverse\nvideo-based tasks such as activity recognition, motion planning, and scene\nunderstanding.\n","authors":["Izzeddin Teeti","Rongali Sai Bhargav","Vivek Singh","Andrew Bradley","Biplab Banerjee","Fabio Cuzzolin"],"pdf_url":"https://arxiv.org/pdf/2308.04589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04583v1","updated":"2023-08-08T21:08:42Z","published":"2023-08-08T21:08:42Z","title":"LATR: 3D Lane Detection from Monocular Images with Transformer","summary":" 3D lane detection from monocular images is a fundamental yet challenging task\nin autonomous driving. Recent advances primarily rely on structural 3D\nsurrogates (e.g., bird's eye view) that are built from front-view image\nfeatures and camera parameters. However, the depth ambiguity in monocular\nimages inevitably causes misalignment between the constructed surrogate feature\nmap and the original image, posing a great challenge for accurate lane\ndetection. To address the above issue, we present a novel LATR model, an\nend-to-end 3D lane detector that uses 3D-aware front-view features without\ntransformed view representation. Specifically, LATR detects 3D lanes via\ncross-attention based on query and key-value pairs, constructed using our\nlane-aware query generator and dynamic 3D ground positional embedding. On the\none hand, each query is generated based on 2D lane-aware features and adopts a\nhybrid embedding to enhance the lane information. On the other hand, 3D space\ninformation is injected as positional embedding from an iteratively-updated 3D\nground plane. LATR outperforms previous state-of-the-art methods on both\nsynthetic Apollo and realistic OpenLane by large margins (e.g., 11.4 gains in\nterms of F1 score on OpenLane). Code will be released at\nhttps://github.com/JMoonr/LATR.\n","authors":["Yueru Luo","Chaoda Zheng","Xu Yan","Tang Kun","Chao Zheng","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2308.04583v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2303.06457v3","updated":"2023-08-08T21:00:21Z","published":"2023-03-11T17:14:30Z","title":"Active Visual Exploration Based on Attention-Map Entropy","summary":" Active visual exploration addresses the issue of limited sensor capabilities\nin real-world scenarios, where successive observations are actively chosen\nbased on the environment. To tackle this problem, we introduce a new technique\ncalled Attention-Map Entropy (AME). It leverages the internal uncertainty of\nthe transformer-based model to determine the most informative observations. In\ncontrast to existing solutions, it does not require additional loss components,\nwhich simplifies the training. 
Through experiments, which also mimic\nretina-like sensors, we show that such simplified training significantly\nimproves the performance of reconstruction, segmentation and classification on\npublicly available datasets.\n","authors":["Adam Pardyl","Grzegorz Rypeść","Grzegorz Kurzejamski","Bartosz Zieliński","Tomasz Trzciński"],"pdf_url":"https://arxiv.org/pdf/2303.06457v3.pdf","comment":"IJCAI 2023"},{"id":"http://arxiv.org/abs/2209.05996v3","updated":"2023-08-08T20:52:26Z","published":"2022-09-13T13:45:18Z","title":"M$^2$-3DLaneNet: Exploring Multi-Modal 3D Lane Detection","summary":" Estimating accurate lane lines in 3D space remains challenging due to their\nsparse and slim nature. Previous works mainly focused on using images for 3D\nlane detection, leading to inherent projection error and loss of geometry\ninformation. To address these issues, we explore the potential of leveraging\nLiDAR for 3D lane detection, either as a standalone method or in combination\nwith existing monocular approaches. In this paper, we propose M$^2$-3DLaneNet\nto integrate complementary information from multiple sensors. Specifically,\nM$^2$-3DLaneNet lifts 2D features into 3D space by incorporating geometry\ninformation from LiDAR data through depth completion. Subsequently, the lifted\n2D features are further enhanced with LiDAR features through cross-modality BEV\nfusion. Extensive experiments on the large-scale OpenLane dataset demonstrate\nthe effectiveness of M$^2$-3DLaneNet, regardless of the range (75m or 100m).\n","authors":["Yueru Luo","Xu Yan","Chaoda Zheng","Chao Zheng","Shuqi Mei","Tang Kun","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2209.05996v3.pdf","comment":"update"},{"id":"http://arxiv.org/abs/2308.04571v1","updated":"2023-08-08T20:36:59Z","published":"2023-08-08T20:36:59Z","title":"Optimizing Algorithms From Pairwise User Preferences","summary":" Typical black-box optimization approaches in robotics focus on learning from\nmetric scores. However, that is not always possible, as not all developers have\nground truth available. Learning appropriate robot behavior in human-centric\ncontexts often requires querying users, who typically cannot provide precise\nmetric scores. Existing approaches leverage human feedback in an attempt to\nmodel an implicit reward function; however, this reward may be difficult or\nimpossible to effectively capture. In this work, we introduce SortCMA to\noptimize algorithm parameter configurations in high dimensions based on\npairwise user preferences. SortCMA efficiently and robustly leverages user\ninput to find parameter sets without directly modeling a reward. We apply this\nmethod to tuning a commercial depth sensor without ground truth, and to robot\nsocial navigation, which involves highly complex preferences over robot\nbehavior. We show that our method succeeds in optimizing for the user's goals\nand perform a user study to evaluate social navigation results.\n","authors":["Leonid Keselman","Katherine Shih","Martial Hebert","Aaron Steinfeld"],"pdf_url":"https://arxiv.org/pdf/2308.04571v1.pdf","comment":"Accepted at IROS 2023"},{"id":"http://arxiv.org/abs/2308.04556v1","updated":"2023-08-08T20:06:12Z","published":"2023-08-08T20:06:12Z","title":"FocalFormer3D : Focusing on Hard Instance for 3D Object Detection","summary":" False negatives (FN) in 3D object detection, {\\em e.g.}, missing predictions\nof pedestrians, vehicles, or other obstacles, can lead to potentially dangerous\nsituations in autonomous driving. 
While being fatal, this issue is understudied\nin many current 3D detection methods. In this work, we propose Hard Instance\nProbing (HIP), a general pipeline that identifies \\textit{FN} in a multi-stage\nmanner and guides the models to focus on excavating difficult instances. For 3D\nobject detection, we instantiate this method as FocalFormer3D, a simple yet\neffective detector that excels at excavating difficult objects and improving\nprediction recall. FocalFormer3D features a multi-stage query generation to\ndiscover hard objects and a box-level transformer decoder to efficiently\ndistinguish objects from massive object candidates. Experimental results on the\nnuScenes and Waymo datasets validate the superior performance of FocalFormer3D.\nThe advantage leads to strong performance on both detection and tracking, in\nboth LiDAR and multi-modal settings. Notably, FocalFormer3D achieves a 70.5 mAP\nand 73.9 NDS on nuScenes detection benchmark, while the nuScenes tracking\nbenchmark shows 72.1 AMOTA, both ranking 1st place on the nuScenes LiDAR\nleaderboard. Our code is available at\n\\url{https://github.com/NVlabs/FocalFormer3D}.\n","authors":["Yilun Chen","Zhiding Yu","Yukang Chen","Shiyi Lan","Animashree Anandkumar","Jiaya Jia","Jose Alvarez"],"pdf_url":"https://arxiv.org/pdf/2308.04556v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.04553v1","updated":"2023-08-08T19:52:28Z","published":"2023-08-08T19:52:28Z","title":"From Fake to Real (FFR): A two-stage training pipeline for mitigating\n spurious correlations with synthetic data","summary":" Visual recognition models are prone to learning spurious correlations induced\nby an imbalanced training set where certain groups (\\eg Females) are\nunder-represented in certain classes (\\eg Programmers). Generative models offer\na promising direction in mitigating this bias by generating synthetic data for\nthe minority samples and thus balancing the training set. However, prior work\nthat uses these approaches overlooks that visual recognition models could often\nlearn to differentiate between real and synthetic images and thus fail to\nunlearn the bias in the original dataset. In our work, we propose a novel\ntwo-stage pipeline to mitigate this issue where 1) we pre-train a model on a\nbalanced synthetic dataset and then 2) fine-tune on the real data. Using this\npipeline, we avoid training on both real and synthetic data, thus avoiding the\nbias between real and synthetic data. Moreover, we learn robust features\nagainst the bias in the first step that mitigate the bias in the second step.\nMoreover, our pipeline naturally integrates with bias mitigation methods; they\ncan be simply applied to the fine-tuning step. As our experiments prove, our\npipeline can further improve the performance of bias mitigation methods\nobtaining state-of-the-art performance on three large-scale datasets.\n","authors":["Maan Qraitem","Kate Saenko","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2308.04553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04551v1","updated":"2023-08-08T19:45:06Z","published":"2023-08-08T19:45:06Z","title":"Improving Medical Image Classification in Noisy Labels Using Only\n Self-supervised Pretraining","summary":" Noisy labels hurt deep learning-based supervised image classification\nperformance as the models may overfit the noise and learn corrupted feature\nextractors. 
For natural image classification training with noisy labeled data,\nmodel initialization with contrastive self-supervised pretrained weights has\nshown to reduce feature corruption and improve classification performance.\nHowever, no works have explored: i) how other self-supervised approaches, such\nas pretext task-based pretraining, impact the learning with noisy label, and\nii) any self-supervised pretraining methods alone for medical images in noisy\nlabel settings. Medical images often feature smaller datasets and subtle inter\nclass variations, requiring human expertise to ensure correct classification.\nThus, it is not clear if the methods improving learning with noisy labels in\nnatural image datasets such as CIFAR would also help with medical images. In\nthis work, we explore contrastive and pretext task-based self-supervised\npretraining to initialize the weights of a deep learning classification model\nfor two medical datasets with self-induced noisy labels -- NCT-CRC-HE-100K\ntissue histological images and COVID-QU-Ex chest X-ray images. Our results show\nthat models initialized with pretrained weights obtained from self-supervised\nlearning can effectively learn better features and improve robustness against\nnoisy labels.\n","authors":["Bidur Khanal","Binod Bhattarai","Bishesh Khanal","Cristian A. Linte"],"pdf_url":"https://arxiv.org/pdf/2308.04551v1.pdf","comment":"Accepted at MICCAI 2023 DEMI Workshop"},{"id":"http://arxiv.org/abs/2308.04549v1","updated":"2023-08-08T19:38:15Z","published":"2023-08-08T19:38:15Z","title":"Prune Spatio-temporal Tokens by Semantic-aware Temporal Accumulation","summary":" Transformers have become the primary backbone of the computer vision\ncommunity due to their impressive performance. However, the unfriendly\ncomputation cost impedes their potential in the video recognition domain. To\noptimize the speed-accuracy trade-off, we propose Semantic-aware Temporal\nAccumulation score (STA) to prune spatio-temporal tokens integrally. STA score\nconsiders two critical factors: temporal redundancy and semantic importance.\nThe former depicts a specific region based on whether it is a new occurrence or\na seen entity by aggregating token-to-token similarity in consecutive frames\nwhile the latter evaluates each token based on its contribution to the overall\nprediction. As a result, tokens with higher scores of STA carry more temporal\nredundancy as well as lower semantics thus being pruned. Based on the STA\nscore, we are able to progressively prune the tokens without introducing any\nadditional parameters or requiring further re-training. We directly apply the\nSTA module to off-the-shelf ViT and VideoSwin backbones, and the empirical\nresults on Kinetics-400 and Something-Something V2 achieve over 30% computation\nreduction with a negligible ~0.2% accuracy drop. The code is released at\nhttps://github.com/Mark12Ding/STA.\n","authors":["Shuangrui Ding","Peisen Zhao","Xiaopeng Zhang","Rui Qian","Hongkai Xiong","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2308.04549v1.pdf","comment":"ICCV 2023 camera ready"},{"id":"http://arxiv.org/abs/2307.10763v2","updated":"2023-08-08T19:31:20Z","published":"2023-07-20T10:53:12Z","title":"Actor-agnostic Multi-label Action Recognition with Multi-modal Query","summary":" Existing action recognition methods are typically actor-specific due to the\nintrinsic topological and apparent differences among the actors. This requires\nactor-specific pose estimation (e.g., humans vs. 
animals), leading to\ncumbersome model design complexity and high maintenance costs. Moreover, they\noften focus on learning the visual modality alone and single-label\nclassification whilst neglecting other available information sources (e.g.,\nclass name text) and the concurrent occurrence of multiple actions. To overcome\nthese limitations, we propose a new approach called 'actor-agnostic multi-modal\nmulti-label action recognition,' which offers a unified solution for various\ntypes of actors, including humans and animals. We further formulate a novel\nMulti-modal Semantic Query Network (MSQNet) model in a transformer-based object\ndetection framework (e.g., DETR), characterized by leveraging visual and\ntextual modalities to represent the action classes better. The elimination of\nactor-specific model designs is a key advantage, as it removes the need for\nactor pose estimation altogether. Extensive experiments on five publicly\navailable benchmarks show that our MSQNet consistently outperforms the prior\narts of actor-specific alternatives on human and animal single- and multi-label\naction recognition tasks by up to 50%. Code will be released at\nhttps://github.com/mondalanindya/MSQNet.\n","authors":["Anindya Mondal","Sauradip Nag","Joaquin M Prada","Xiatian Zhu","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2307.10763v2.pdf","comment":"Accepted at the 2023 IEEE/CVF International Conference on Computer\n Vision Workshops (ICCVW), Paris, France"},{"id":"http://arxiv.org/abs/2308.04542v1","updated":"2023-08-08T19:18:20Z","published":"2023-08-08T19:18:20Z","title":"YUDO: YOLO for Uniform Directed Object Detection","summary":" This paper presents an efficient way of detecting directed objects by\npredicting their center coordinates and direction angle. Since the objects are\nof uniform size, the proposed model works without predicting the object's width\nand height. The dataset used for this problem is presented in Honeybee\nSegmentation and Tracking Datasets project. One of the contributions of this\nwork is an examination of the ability of the standard real-time object\ndetection architecture like YoloV7 to be customized for position and direction\ndetection. A very efficient, tiny version of the architecture is used in this\napproach. Moreover, only one of three detection heads without anchors is\nsufficient for this task. We also introduce the extended Skew Intersection over\nUnion (SkewIoU) calculation for rotated boxes - directed IoU (DirIoU), which\nincludes an absolute angle difference. DirIoU is used both in the matching\nprocedure of target and predicted bounding boxes for mAP calculation, and in\nthe NMS filtering procedure. The code and models are available at\nhttps://github.com/djordjened92/yudo.\n","authors":["Đorđe Nedeljković"],"pdf_url":"https://arxiv.org/pdf/2308.04542v1.pdf","comment":"The Paper is accepted in 25th Irish Machine Vision and Image\n Processing Conference (IMVIP23)"},{"id":"http://arxiv.org/abs/2303.09472v2","updated":"2023-08-08T19:15:38Z","published":"2023-03-16T16:47:14Z","title":"DiffIR: Efficient Diffusion Model for Image Restoration","summary":" Diffusion model (DM) has achieved SOTA performance by modeling the image\nsynthesis process into a sequential application of a denoising network.\nHowever, different from image synthesis, image restoration (IR) has a strong\nconstraint to generate results in accordance with ground-truth. 
Thus, for IR,\ntraditional DMs running massive iterations on a large model to estimate whole\nimages or feature maps is inefficient. To address this issue, we propose an\nefficient DM for IR (DiffIR), which consists of a compact IR prior extraction\nnetwork (CPEN), dynamic IR transformer (DIRformer), and denoising network.\nSpecifically, DiffIR has two training stages: pretraining and training DM. In\npretraining, we input ground-truth images into CPEN$_{S1}$ to capture a compact\nIR prior representation (IPR) to guide DIRformer. In the second stage, we train\nthe DM to directly estimate the same IRP as pretrained CPEN$_{S1}$ only using\nLQ images. We observe that since the IPR is only a compact vector, DiffIR can\nuse fewer iterations than traditional DM to obtain accurate estimations and\ngenerate more stable and realistic results. Since the iterations are few, our\nDiffIR can adopt a joint optimization of CPEN$_{S2}$, DIRformer, and denoising\nnetwork, which can further reduce the estimation error influence. We conduct\nextensive experiments on several IR tasks and achieve SOTA performance while\nconsuming less computational costs. Code is available at\n\\url{https://github.com/Zj-BinXia/DiffIR}.\n","authors":["Bin Xia","Yulun Zhang","Shiyin Wang","Yitong Wang","Xinglong Wu","Yapeng Tian","Wenming Yang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.09472v2.pdf","comment":"This paper is accepted by ICCV2023. Codes and models are available at\n https://github.com/Zj-BinXia/DiffIR"},{"id":"http://arxiv.org/abs/2308.04536v1","updated":"2023-08-08T18:57:03Z","published":"2023-08-08T18:57:03Z","title":"Facial Prior Based First Order Motion Model for Micro-expression\n Generation","summary":" Spotting facial micro-expression from videos finds various potential\napplications in fields including clinical diagnosis and interrogation,\nmeanwhile this task is still difficult due to the limited scale of training\ndata. To solve this problem, this paper tries to formulate a new task called\nmicro-expression generation and then presents a strong baseline which combines\nthe first order motion model with facial prior knowledge. Given a target face,\nwe intend to drive the face to generate micro-expression videos according to\nthe motion patterns of source videos. Specifically, our new model involves\nthree modules. First, we extract facial prior features from a region focusing\nmodule. Second, we estimate facial motion using key points and local affine\ntransformations with a motion prediction module. Third, expression generation\nmodule is used to drive the target face to generate videos. We train our model\non public CASME II, SAMM and SMIC datasets and then use the model to generate\nnew micro-expression videos for evaluation. Our model achieves the first place\nin the Facial Micro-Expression Challenge 2021 (MEGC2021), where our superior\nperformance is verified by three experts with Facial Action Coding System\ncertification. Source code is provided in\nhttps://github.com/Necolizer/Facial-Prior-Based-FOMM.\n","authors":["Yi Zhang","Youjun Zhao","Yuhang Wen","Zixuan Tang","Xinhua Xu","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04536v1.pdf","comment":"ACM Multimedia 2021"},{"id":"http://arxiv.org/abs/2308.04535v1","updated":"2023-08-08T18:57:01Z","published":"2023-08-08T18:57:01Z","title":"Estimation of Human Condition at Disaster Site Using Aerial Drone Images","summary":" Drones are being used to assess the situation in various disasters. 
In this\nstudy, we investigate a method to automatically estimate the damage status of\npeople based on their actions in aerial drone images in order to understand\ndisaster sites faster and save labor. We constructed a new dataset of aerial\nimages of human actions in a hypothetical disaster that occurred in an urban\narea, and classified the human damage status using 3D ResNet. The results\nshowed that the status with characteristic human actions could be classified\nwith a recall rate of more than 80%, while other statuses with similar human\nactions could only be classified with a recall rate of about 50%. In addition,\na cloud-based VR presentation application suggested the effectiveness of using\ndrones to understand the disaster site and estimate the human condition.\n","authors":["Tomoki Arai","Kenji Iwata","Kensho Hara","Yutaka Satoh"],"pdf_url":"https://arxiv.org/pdf/2308.04535v1.pdf","comment":"In submission to the ICCV 2023 Artificial Intelligence for\n Humanitarian Assistance and Disaster Response Workshop"},{"id":"http://arxiv.org/abs/2305.07026v3","updated":"2023-08-08T18:50:07Z","published":"2023-05-11T17:58:47Z","title":"Decentralization and Acceleration Enables Large-Scale Bundle Adjustment","summary":" Scaling to arbitrarily large bundle adjustment problems requires data and\ncompute to be distributed across multiple devices. Centralized methods in prior\nworks are only able to solve small or medium size problems due to overhead in\ncomputation and communication. In this paper, we present a fully decentralized\nmethod that alleviates computation and communication bottlenecks to solve\narbitrarily large bundle adjustment problems. We achieve this by reformulating\nthe reprojection error and deriving a novel surrogate function that decouples\noptimization variables from different devices. This function makes it possible\nto use majorization minimization techniques and reduces bundle adjustment to\nindependent optimization subproblems that can be solved in parallel. We further\napply Nesterov's acceleration and adaptive restart to improve convergence while\nmaintaining its theoretical guarantees. Despite limited peer-to-peer\ncommunication, our method has provable convergence to first-order critical\npoints under mild conditions. On extensive benchmarks with public datasets, our\nmethod converges much faster than decentralized baselines with similar memory\nusage and communication load. Compared to centralized baselines using a single\ndevice, our method, while being decentralized, yields more accurate solutions\nwith significant speedups of up to 953.7x over Ceres and 174.6x over DeepLM.\nCode: https://joeaortiz.github.io/daba.\n","authors":["Taosha Fan","Joseph Ortiz","Ming Hsiao","Maurizio Monge","Jing Dong","Todd Murphey","Mustafa Mukadam"],"pdf_url":"https://arxiv.org/pdf/2305.07026v3.pdf","comment":"Robotics: Science and Systems (RSS), 2023"},{"id":"http://arxiv.org/abs/2209.00128v3","updated":"2023-08-08T18:48:21Z","published":"2022-08-31T21:45:16Z","title":"Archangel: A Hybrid UAV-based Human Detection Benchmark with Position\n and Pose Metadata","summary":" Learning to detect objects, such as humans, in imagery captured by an\nunmanned aerial vehicle (UAV) usually suffers from tremendous variations caused\nby the UAV's position towards the objects. In addition, existing UAV-based\nbenchmark datasets do not provide adequate dataset metadata, which is essential\nfor precise model diagnosis and learning features invariant to those\nvariations. 
In this paper, we introduce Archangel, the first UAV-based object\ndetection dataset composed of real and synthetic subsets captured with similar\nimagining conditions and UAV position and object pose metadata. A series of\nexperiments are carefully designed with a state-of-the-art object detector to\ndemonstrate the benefits of leveraging the metadata during model evaluation.\nMoreover, several crucial insights involving both real and synthetic data\nduring model optimization are presented. In the end, we discuss the advantages,\nlimitations, and future directions regarding Archangel to highlight its\ndistinct value for the broader machine learning community.\n","authors":["Yi-Ting Shen","Yaesop Lee","Heesung Kwon","Damon M. Conover","Shuvra S. Bhattacharyya","Nikolas Vale","Joshua D. Gray","G. Jeremy Leong","Kenneth Evensen","Frank Skirlo"],"pdf_url":"https://arxiv.org/pdf/2209.00128v3.pdf","comment":"IEEE Access"},{"id":"http://arxiv.org/abs/2308.04529v1","updated":"2023-08-08T18:47:25Z","published":"2023-08-08T18:47:25Z","title":"Generating Modern Persian Carpet Map by Style-transfer","summary":" Today, the great performance of Deep Neural Networks(DNN) has been proven in\nvarious fields. One of its most attractive applications is to produce artistic\ndesigns. A carpet that is known as a piece of art is one of the most important\nitems in a house, which has many enthusiasts all over the world. The first\nstage of producing a carpet is to prepare its map, which is a difficult,\ntime-consuming, and expensive task. In this research work, our purpose is to\nuse DNN for generating a Modern Persian Carpet Map. To reach this aim, three\ndifferent DNN style transfer methods are proposed and compared against each\nother. In the proposed methods, the Style-Swap method is utilized to create the\ninitial carpet map, and in the following, to generate more diverse designs,\nmethods Clip-Styler, Gatys, and Style-Swap are used separately. In addition,\nsome methods are examined and introduced for coloring the produced carpet maps.\nThe designed maps are evaluated via the results of filled questionnaires where\nthe outcomes of user evaluations confirm the popularity of generated carpet\nmaps. Eventually, for the first time, intelligent methods are used in producing\ncarpet maps, and it reduces human intervention. The proposed methods can\nsuccessfully produce diverse carpet designs, and at a higher speed than\ntraditional ways.\n","authors":["Dorsa Rahmatian","Monireh Moshavash","Mahdi Eftekhari","Kamran Hoseinkhani"],"pdf_url":"https://arxiv.org/pdf/2308.04529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04528v1","updated":"2023-08-08T18:46:16Z","published":"2023-08-08T18:46:16Z","title":"Unsupervised Camouflaged Object Segmentation as Domain Adaptation","summary":" Deep learning for unsupervised image segmentation remains challenging due to\nthe absence of human labels. The common idea is to train a segmentation head,\nwith the supervision of pixel-wise pseudo-labels generated based on the\nrepresentation of self-supervised backbones. By doing so, the model performance\ndepends much on the distance between the distributions of target datasets and\nthe pre-training dataset (e.g., ImageNet). 
In this work, we investigate a new\ntask, namely unsupervised camouflaged object segmentation (UCOS), where the\ntarget objects own a common rarely-seen attribute, i.e., camouflage.\nUnsurprisingly, we find that the state-of-the-art unsupervised models struggle\nto adapt to UCOS, due to the domain gap between the properties of generic and\ncamouflaged objects. To this end, we formulate UCOS as a source-free\nunsupervised domain adaptation task (UCOS-DA), where both source labels and\ntarget labels are absent during the whole model training process. Specifically,\nwe define a source model consisting of self-supervised vision transformers\npre-trained on ImageNet. On the other hand, the target domain includes a simple\nlinear layer (i.e., our target model) and unlabeled camouflaged objects. We\nthen design a pipeline for foreground-background-contrastive self-adversarial\ndomain adaptation, to achieve robust UCOS. As a result, our baseline model\nachieves superior segmentation performance when compared with competing\nunsupervised models on the UCOS benchmark, with a training set whose scale\nis only one tenth of that of the supervised COS counterpart.\n","authors":["Yi Zhang","Chengyi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.04528v1.pdf","comment":"12 pages, 6 figures, 3 tables; Project Page:\n https://github.com/Jun-Pu/UCOS-DA ; Accepted to ICCV 2023 Workshop on OOD-CV"},{"id":"http://arxiv.org/abs/2308.04526v1","updated":"2023-08-08T18:41:38Z","published":"2023-08-08T18:41:38Z","title":"Large-Scale Multi-Hypotheses Cell Tracking Using Ultrametric Contours\n Maps","summary":" In this work, we describe a method for large-scale 3D cell-tracking through a\nsegmentation selection approach. The proposed method is effective at tracking\ncells across large microscopy datasets on two fronts: (i) It can solve problems\ncontaining millions of segmentation instances in terabyte-scale 3D+t datasets;\n(ii) It achieves competitive results with or without deep learning, which\nrequires 3D annotated data that is scarce in the fluorescence microscopy\nfield. The proposed method computes cell tracks and segments using a hierarchy\nof segmentation hypotheses and selects disjoint segments by maximizing the\noverlap between adjacent frames. We show that this method achieves\nstate-of-the-art results in 3D images from the cell tracking challenge and has\na faster integer linear programming formulation. Moreover, our framework is\nflexible and supports segmentations from off-the-shelf cell segmentation models\nand can combine them into an ensemble that improves tracking. The code is\navailable at https://github.com/royerlab/ultrack.\n","authors":["Jordão Bragantini","Merlin Lange","Loïc Royer"],"pdf_url":"https://arxiv.org/pdf/2308.04526v1.pdf","comment":"13 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.04515v1","updated":"2023-08-08T18:24:53Z","published":"2023-08-08T18:24:53Z","title":"Toward unlabeled multi-view 3D pedestrian detection by generalizable AI:\n techniques and performance analysis","summary":" We unveil how generalizable AI can be used to improve multi-view 3D\npedestrian detection in unlabeled target scenes. One way to increase\ngeneralization to new scenes is to automatically label target data, which can\nthen be used for training a detector model. 
In this context, we investigate two\napproaches for automatically labeling target data: pseudo-labeling using a\nsupervised detector and automatic labeling using an untrained detector (that\ncan be applied out of the box without any training). We adopt a training\nframework for optimizing detector models using automatic labeling procedures.\nThis framework encompasses different training sets/modes and multi-round\nautomatic labeling strategies. We conduct our analyses on the\npublicly-available WILDTRACK and MultiviewX datasets. We show that, by using\nthe automatic labeling approach based on an untrained detector, we can obtain\nbetter results than by directly using the untrained detector or a detector\ntrained with an existing labeled source dataset. It achieved MODA scores about 4%\nand 1% better than the best existing unlabeled method when using WILDTRACK and\nMultiviewX as target datasets, respectively.\n","authors":["João Paulo Lima","Diego Thomas","Hideaki Uchiyama","Veronica Teichrieb"],"pdf_url":"https://arxiv.org/pdf/2308.04515v1.pdf","comment":"Accepted to SIBGRAPI 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2210.07774v3","updated":"2023-08-08T17:48:12Z","published":"2022-09-19T22:57:10Z","title":"Learning To Rank Diversely At Airbnb","summary":" Airbnb is a two-sided marketplace, bringing together hosts who own listings\nfor rent, with prospective guests from around the globe. Applying neural\nnetwork-based learning to rank techniques has led to significant improvements\nin matching guests with hosts. These improvements in ranking were driven by a\ncore strategy: order the listings by their estimated booking probabilities,\nthen iterate on techniques to make these booking probability estimates more and\nmore accurate. Embedded implicitly in this strategy was an assumption that the\nbooking probability of a listing could be determined independently of other\nlistings in search results. In this paper we discuss how this assumption,\npervasive throughout the commonly-used learning to rank frameworks, is false.\nWe provide a theoretical foundation correcting this assumption, followed by\nefficient neural network architectures based on the theory. Explicitly\naccounting for possible similarities between listings and reducing them to\ndiversify the search results generated a strong positive impact. We discuss these\nmetric wins as part of the online A/B tests of the theory. Our method provides\na practical way to diversify search results for large-scale production ranking\nsystems.\n","authors":["Malay Haldar","Mustafa Abdool","Liwei He","Dillon Davis","Huiji Gao","Sanjeev Katariya"],"pdf_url":"https://arxiv.org/pdf/2210.07774v3.pdf","comment":"Search ranking, Diversity, e-commerce"},{"id":"http://arxiv.org/abs/2112.06668v2","updated":"2023-08-08T16:32:12Z","published":"2021-12-13T13:42:35Z","title":"CT4Rec: Simple yet Effective Consistency Training for Sequential\n Recommendation","summary":" Sequential recommendation methods play an important role in real-world\nrecommender systems. These systems are able to capture user preferences by taking\nadvantage of historical records and then performing recommendations.\nContrastive learning (CL) is a cutting-edge technology that can assist us in\nobtaining informative user representations, but these CL-based models need\nsubtle negative sampling strategies, tedious data augmentation methods, and\nheavy hyper-parameter tuning work. 
In this paper, we introduce another way to\ngenerate better user representations and recommend more attractive items to\nusers. Particularly, we put forward an effective \\textbf{C}onsistency\n\\textbf{C}onstraint for sequential \\textbf{Rec}ommendation (C$^2$-Rec) in which\nonly two extra training objectives are used without any structural\nmodifications or data augmentation strategies. Substantial experiments have\nbeen conducted on three benchmark datasets and one real industrial dataset,\nwhich prove that our proposed method outperforms SOTA models substantially.\nFurthermore, our method needs much less training time than those CL-based\nmodels. An online A/B test on real-world recommendation systems also achieves a\n10.141\\% improvement in the click-through rate and a 10.541\\% increase in the\naverage click number per capita. The code is available at\n\\url{https://github.com/zhengrongqin/C2-Rec}.\n","authors":["Chong Liu","Xiaoyang Liu","Rongqin Zheng","Lixin Zhang","Xiaobo Liang","Juntao Li","Lijun Wu","Min Zhang","Leyu Lin"],"pdf_url":"https://arxiv.org/pdf/2112.06668v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04380v1","updated":"2023-08-08T16:31:43Z","published":"2023-08-08T16:31:43Z","title":"Your Negative May not Be True Negative: Boosting Image-Text Matching\n with False Negative Elimination","summary":" Most existing image-text matching methods adopt triplet loss as the\noptimization objective, and choosing a proper negative sample for the triplet\nof (anchor, positive, negative) is important for effectively training the\nmodel, e.g., hard negatives make the model learn efficiently and effectively.\nHowever, we observe that existing methods mainly employ the most similar\nsamples as hard negatives, which may not be true negatives. In other words, the\nsamples with high similarity but not paired with the anchor may retain\npositive semantic associations, and we call them false negatives. Repelling\nthese false negatives in triplet loss would mislead the semantic representation\nlearning and result in inferior retrieval performance. In this paper, we\npropose a novel False Negative Elimination (FNE) strategy to select negatives\nvia sampling, which could alleviate the problem introduced by false negatives.\nSpecifically, we first construct the distributions of positive and negative\nsamples separately via their similarities with the anchor, based on the\nfeatures extracted from image and text encoders. Then we calculate the false\nnegative probability of a given sample based on its similarity with the anchor\nand the above distributions via Bayes' rule, which is employed as the\nsampling weight during the negative sampling process. Since there may not exist any\nfalse negatives in a small batch, we design a memory module with momentum\nto retain a large negative buffer and implement our negative sampling strategy\nspanning over the buffer. In addition, to make the model focus on hard\nnegatives, we reassign the sampling weights for the simple negatives with a\ncut-down strategy. Extensive experiments are conducted on Flickr30K and\nMS-COCO, and the results demonstrate the superiority of our proposed false\nnegative elimination strategy. 
The code is available at\nhttps://github.com/LuminosityX/FNE.\n","authors":["Haoxuan Li","Yi Bin","Junrong Liao","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04380v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03735v2","updated":"2023-08-08T16:20:18Z","published":"2023-08-07T17:34:58Z","title":"Randomized algorithms for precise measurement of differentially-private,\n personalized recommendations","summary":" Personalized recommendations form an important part of today's internet\necosystem, helping artists and creators to reach interested users, and helping\nusers to discover new and engaging content. However, many users today are\nskeptical of platforms that personalize recommendations, in part due to\nhistorically careless treatment of personal data and data privacy. Now,\nbusinesses that rely on personalized recommendations are entering a new\nparadigm, where many of their systems must be overhauled to be privacy-first.\nIn this article, we propose an algorithm for personalized recommendations that\nfacilitates both precise and differentially-private measurement. We consider\nadvertising as an example application, and conduct offline experiments to\nquantify how the proposed privacy-preserving algorithm affects key metrics\nrelated to user experience, advertiser value, and platform revenue compared to\nthe extremes of both (private) non-personalized and non-private, personalized\nimplementations.\n","authors":["Allegra Laro","Yanqing Chen","Hao He","Babak Aghazadeh"],"pdf_url":"https://arxiv.org/pdf/2308.03735v2.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2308.04343v1","updated":"2023-08-08T15:43:59Z","published":"2023-08-08T15:43:59Z","title":"Unifying Two-Stream Encoders with Transformers for Cross-Modal Retrieval","summary":" Most existing cross-modal retrieval methods employ two-stream encoders with\ndifferent architectures for images and texts, \\textit{e.g.}, CNN for images and\nRNN/Transformer for texts. Such discrepancy in architectures may induce\ndifferent semantic distribution spaces and limit the interactions between\nimages and texts, and further result in inferior alignment between images and\ntexts. To fill this research gap, inspired by recent advances of Transformers\nin vision tasks, we propose to unify the encoder architectures with\nTransformers for both modalities. Specifically, we design a cross-modal\nretrieval framework purely based on two-stream Transformers, dubbed\n\\textbf{Hierarchical Alignment Transformers (HAT)}, which consists of an image\nTransformer, a text Transformer, and a hierarchical alignment module. With such\nidentical architectures, the encoders could produce representations with more\nsimilar characteristics for images and texts, and make the interactions and\nalignments between them much easier. Besides, to leverage the rich semantics,\nwe devise a hierarchical alignment scheme to explore multi-level\ncorrespondences of different layers between images and texts. To evaluate the\neffectiveness of the proposed HAT, we conduct extensive experiments on two\nbenchmark datasets, MSCOCO and Flickr30K. Experimental results demonstrate that\nHAT outperforms SOTA baselines by a large margin. Specifically, on two key\ntasks, \\textit{i.e.}, image-to-text and text-to-image retrieval, HAT achieves\n7.6\\% and 16.7\\% relative score improvement of Recall@1 on MSCOCO, and 4.4\\%\nand 11.6\\% on Flickr30k respectively. 
The code is available at\n\\url{https://github.com/LuminosityX/HAT}.\n","authors":["Yi Bin","Haoxuan Li","Yahui Xu","Xing Xu","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04343v1.pdf","comment":"Accepted at ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.04258v1","updated":"2023-08-08T13:46:55Z","published":"2023-08-08T13:46:55Z","title":"Advancing Natural-Language Based Audio Retrieval with PaSST and Large\n Audio-Caption Data Sets","summary":" This work presents a text-to-audio-retrieval system based on pre-trained text\nand spectrogram transformers. Our method projects recordings and textual\ndescriptions into a shared audio-caption space in which related examples from\ndifferent modalities are close. Through a systematic analysis, we examine how\neach component of the system influences retrieval performance. As a result, we\nidentify two key components that play a crucial role in driving performance:\nthe self-attention-based audio encoder for audio embedding and the utilization\nof additional human-generated and synthetic data sets during pre-training. We\nfurther experimented with augmenting ClothoV2 captions with available keywords\nto increase their variety; however, this only led to marginal improvements. Our\nsystem ranked first in the 2023's DCASE Challenge, and it outperforms the\ncurrent state of the art on the ClothoV2 benchmark by 5.6 pp. mAP@10.\n","authors":["Paul Primus","Khaled Koutini","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2308.04258v1.pdf","comment":"submitted to DCASE Workshop 2023"},{"id":"http://arxiv.org/abs/2308.04247v1","updated":"2023-08-08T13:26:36Z","published":"2023-08-08T13:26:36Z","title":"UniRecSys: A Unified Framework for Personalized, Group, Package, and\n Package-to-Group Recommendations","summary":" Recommender systems aim to enhance the overall user experience by providing\ntailored recommendations for a variety of products and services. These systems\nhelp users make more informed decisions, leading to greater user satisfaction\nwith the platform. However, the implementation of these systems largely depends\non the context, which can vary from recommending an item or package to a user\nor a group. This requires careful exploration of several models during the\ndeployment, as there is no comprehensive and unified approach that deals with\nrecommendations at different levels. Furthermore, these individual models must\nbe closely attuned to their generated recommendations depending on the context\nto prevent significant variation in their generated recommendations. In this\npaper, we propose a novel unified recommendation framework that addresses all\nfour recommendation tasks, namely personalized, group, package, or\npackage-to-group recommendation, filling the gap in the current research\nlandscape. The proposed framework can be integrated with most of the\ntraditional matrix factorization-based collaborative filtering models. The idea\nis to enhance the formulation of the existing approaches by incorporating\ncomponents focusing on the exploitation of the group and package latent\nfactors. These components also help in exploiting a rich latent representation\nof the user/item by enforcing them to align closely with their corresponding\ngroup/package representation. We consider two prominent CF techniques,\nRegularized Matrix Factorization and Maximum Margin Matrix factorization, as\nthe baseline models and demonstrate their customization to various\nrecommendation tasks. 
Experiment results on two publicly available datasets are\nreported, comparing them to other baseline approaches that consider individual\nrating feedback for group or package recommendations.\n","authors":["Adamya Shyam","Vikas Kumar","Venkateswara Rao Kagita","Arun K Pujari"],"pdf_url":"https://arxiv.org/pdf/2308.04247v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2308.04226v1","updated":"2023-08-08T12:45:01Z","published":"2023-08-08T12:45:01Z","title":"OpinionConv: Conversational Product Search with Grounded Opinions","summary":" When searching for products, the opinions of others play an important role in\nmaking informed decisions. Subjective experiences about a product can be a\nvaluable source of information. This is also true in sales conversations, where\na customer and a sales assistant exchange facts and opinions about products.\nHowever, training an AI for such conversations is complicated by the fact that\nlanguage models do not possess authentic opinions for their lack of real-world\nexperience. We address this problem by leveraging product reviews as a rich\nsource of product opinions to ground conversational AI in true subjective\nnarratives. With OpinionConv, we develop the first conversational AI for\nsimulating sales conversations. To validate the generated conversations, we\nconduct several user studies showing that the generated opinions are perceived\nas realistic. Our assessors also confirm the importance of opinions as an\ninformative basis for decision-making.\n","authors":["Vahid Sadiri Javadi","Martin Potthast","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.04226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10046v2","updated":"2023-08-08T09:46:21Z","published":"2023-06-12T08:21:50Z","title":"Document Layout Annotation: Database and Benchmark in the Domain of\n Public Affairs","summary":" Every day, thousands of digital documents are generated with useful\ninformation for companies, public organizations, and citizens. Given the\nimpossibility of processing them manually, the automatic processing of these\ndocuments is becoming increasingly necessary in certain sectors. However, this\ntask remains challenging, since in most cases a text-only based parsing is not\nenough to fully understand the information presented through different\ncomponents of varying significance. In this regard, Document Layout Analysis\n(DLA) has been an interesting research field for many years, which aims to\ndetect and classify the basic components of a document. In this work, we used a\nprocedure to semi-automatically annotate digital documents with different\nlayout labels, including 4 basic layout blocks and 4 text categories. We apply\nthis procedure to collect a novel database for DLA in the public affairs\ndomain, using a set of 24 data sources from the Spanish Administration. The\ndatabase comprises 37.9K documents with more than 441K document pages, and more\nthan 8M labels associated to 8 layout block units. 
The results of our\nexperiments validate the proposed text labeling procedure with accuracy up to\n99%.\n","authors":["Alejandro Peña","Aythami Morales","Julian Fierrez","Javier Ortega-Garcia","Marcos Grande","Iñigo Puente","Jorge Cordova","Gonzalo Cordova"],"pdf_url":"https://arxiv.org/pdf/2306.10046v2.pdf","comment":"Accepted in ICDAR 2023 Workshop on Machine Vision and NLP for\n Document Analysis"},{"id":"http://arxiv.org/abs/2308.04086v1","updated":"2023-08-08T06:58:05Z","published":"2023-08-08T06:58:05Z","title":"Understanding and Modeling Passive-Negative Feedback for Short-video\n Sequential Recommendation","summary":" Sequential recommendation is one of the most important tasks in recommender\nsystems, which aims to recommend the next interacted item with historical\nbehaviors as input. Traditional sequential recommendation mainly\nconsiders collected positive feedback such as clicks, purchases, etc.\nHowever, in short-video platforms such as TikTok, video viewing behavior may\nnot always represent positive feedback. Specifically, the videos are played\nautomatically, and users passively receive the recommended videos. In this new\nscenario, users passively express negative feedback by skipping over videos\nthey do not like, which provides valuable information about their preferences.\nDifferent from the negative feedback studied in traditional recommender\nsystems, this passive-negative feedback can reflect users' interests and serve\nas an important supervision signal in extracting users' preferences. Therefore,\nit is essential to carefully design and utilize it in this novel recommendation\nscenario. In this work, we first conduct analyses based on a large-scale\nreal-world short-video behavior dataset and illustrate the significance of\nleveraging passive feedback. We then propose a novel method that deploys the\nsub-interest encoder, which incorporates positive feedback and passive-negative\nfeedback as supervision signals to learn the user's current active\nsub-interest. Moreover, we introduce an adaptive fusion layer to integrate\nvarious sub-interests effectively. To enhance the robustness of our model, we\nthen introduce a multi-task learning module to simultaneously optimize two\nkinds of feedback -- passive-negative feedback and traditional randomly-sampled\nnegative feedback. The experiments on two large-scale datasets verify that the\nproposed method can significantly outperform state-of-the-art approaches. The\ncode is released at https://github.com/tsinghua-fib-lab/RecSys2023-SINE.\n","authors":["Yunzhu Pan","Chen Gao","Jianxin Chang","Yanan Niu","Yang Song","Kun Gai","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.04086v1.pdf","comment":"Accepted by RecSys'23"},{"id":"http://arxiv.org/abs/2206.12893v3","updated":"2023-08-08T06:40:15Z","published":"2022-06-26T14:51:18Z","title":"PCDF: A Parallel-Computing Distributed Framework for Sponsored Search\n Advertising Serving","summary":" Traditional online advertising systems for sponsored search follow a cascade\nparadigm of retrieval, pre-ranking, and ranking. Constrained by\nstrict requirements on online inference efficiency, it tends to be difficult to\ndeploy useful but computationally intensive modules in the ranking stage.\nMoreover, ranking models currently used in the industry assume the user click\nrelies only on the advertisement itself, which results in the ranking stage\noverlooking the impact of organic search results on the predicted\nadvertisements (ads). 
In this work, we propose a novel framework,\nPCDF (Parallel-Computing Distributed Framework), which splits the\ncomputation cost into three parts and deploys them in the pre-module in\nparallel with the retrieval stage, the middle-module for ranking ads, and the\npost-module for re-ranking ads with external items. Our PCDF effectively\nreduces the overall inference latency compared with the classic framework. The\nwhole module is trained offline end-to-end and adapts to the online learning\nparadigm. To our knowledge, we are the first to propose an end-to-end solution\nfor online training and deployment of complex CTR models from the system\nframework side.\n","authors":["Han Xu","Hao Qi","Kunyao Wang","Pei Wang","Guowei Zhang","Congcong Liu","Junsheng Jin","Xiwei Zhao","Zhangang Lin","Jinghe Hu","Jingping Shao"],"pdf_url":"https://arxiv.org/pdf/2206.12893v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04067v1","updated":"2023-08-08T06:04:17Z","published":"2023-08-08T06:04:17Z","title":"Online Distillation-enhanced Multi-modal Transformer for Sequential\n Recommendation","summary":" Multi-modal recommendation systems, which integrate diverse types of\ninformation, have gained widespread attention in recent years. However,\ncompared to traditional collaborative filtering-based multi-modal\nrecommendation systems, research on multi-modal sequential recommendation is\nstill in its nascent stages. Unlike traditional sequential recommendation\nmodels that solely rely on item identifier (ID) information and focus on\nnetwork structure design, multi-modal recommendation models need to emphasize\nitem representation learning and the fusion of heterogeneous data sources. This\npaper investigates the impact of item representation learning on downstream\nrecommendation tasks and examines the disparities in information fusion at\ndifferent stages. Empirical experiments are conducted to demonstrate the need\nto design a framework suitable for collaborative learning and fusion of diverse\ninformation. Based on this, we propose a new model-agnostic framework for\nmulti-modal sequential recommendation tasks, called Online\nDistillation-enhanced Multi-modal Transformer (ODMT), to enhance feature\ninteraction and mutual learning among multi-source input (ID, text, and image),\nwhile avoiding conflicts among different features during training, thereby\nimproving recommendation accuracy. To be specific, we first introduce an\nID-aware Multi-modal Transformer module in the item representation learning\nstage to facilitate information interaction among different features. Secondly,\nwe employ an online distillation training strategy in the prediction\noptimization stage to make multi-source data learn from each other and improve\nprediction robustness. 
Experimental results on a video content recommendation\ndataset and three e-commerce recommendation datasets demonstrate the\neffectiveness of the two proposed modules, which yield approximately a 10%\nimprovement in performance compared to baseline models.\n","authors":["Wei Ji","Xiangyan Liu","An Zhang","Yinwei Wei","Yongxin Ni","Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04067v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.04033v1","updated":"2023-08-08T04:21:14Z","published":"2023-08-08T04:21:14Z","title":"Adapting Foundation Models for Information Synthesis of Wireless\n Communication Specifications","summary":" Existing approaches to understanding, developing and researching modern\nwireless communication technologies involve a time-intensive and arduous process\nof sifting through numerous webpages and technical specification documents,\ngathering the required information and synthesizing it. This paper presents\nNextGen Communications Copilot, a conversational artificial intelligence tool\nfor information synthesis of wireless communication specifications. The system\nbuilds on top of recent advancements in foundation models and consists of three\nkey additional components: a domain-specific database, a context extractor, and\na feedback mechanism. The system appends user queries with concise and\nquery-dependent contextual information extracted from a database of wireless\ntechnical specifications and incorporates tools for expert feedback and data\ncontributions. On evaluation using a benchmark dataset of queries and reference\nresponses created by subject matter experts, the system demonstrated more\nrelevant and accurate answers with an average BLEU score and BERTScore\nF1-measure of 0.37 and 0.79 respectively compared to the corresponding values\nof 0.07 and 0.59 achieved by state-of-the-art tools like ChatGPT.\n","authors":["Manikanta Kotaru"],"pdf_url":"https://arxiv.org/pdf/2308.04033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04028v1","updated":"2023-08-08T04:06:11Z","published":"2023-08-08T04:06:11Z","title":"Top K Relevant Passage Retrieval for Biomedical Question Answering","summary":" Question answering is a task that answers factoid questions using a large\ncollection of documents. It aims to provide precise answers in response to the\nuser's questions in natural language. Question answering relies on efficient\npassage retrieval to select candidate contexts, where traditional sparse vector\nspace models, such as TF-IDF or BM25, are the de facto method. On the web,\nthere is no single article that could provide all the possible answers\navailable on the internet to the question asked by the user. The\nexisting Dense Passage Retrieval model has been trained on a Wikipedia dump from\nDec. 20, 2018, as the source documents for answering questions. Question\nanswering (QA) has made big strides with several open-domain and machine\ncomprehension systems built using large-scale annotated datasets. However, in\nthe clinical domain, this problem remains relatively unexplored. According to\nmultiple surveys, biomedical questions cannot be answered correctly from\nWikipedia articles. In this work, we build on the existing DPR framework for the\nbiomedical domain and retrieve answers from PubMed articles, which are a\nreliable source for answering medical questions. 
When evaluated on a BioASQ QA\ndataset, our fine-tuned dense retriever results in a 0.81 F1 score.\n","authors":["Shashank Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.04028v1.pdf","comment":"6 pages, 5 figures. arXiv admin note: text overlap with\n arXiv:2004.04906 by other authors"},{"id":"http://arxiv.org/abs/2308.04019v1","updated":"2023-08-08T03:33:15Z","published":"2023-08-08T03:33:15Z","title":"Exploring the Spatiotemporal Features of Online Food Recommendation\n Service","summary":" Online Food Recommendation Service (OFRS) has remarkable spatiotemporal\ncharacteristics and the advantage of being able to conveniently satisfy users'\nneeds in a timely manner. There have been a variety of studies that have begun\nto explore its spatiotemporal properties, but a comprehensive and in-depth\nanalysis of the OFRS spatiotemporal features is yet to be conducted. Therefore,\nthis paper studies the OFRS based on three questions: how spatiotemporal\nfeatures play a role; why self-attention cannot be used to model the\nspatiotemporal sequences of OFRS; and how to combine spatiotemporal features to\nimprove the efficiency of OFRS. Firstly, through experimental analysis, we\nsystematically extracted the spatiotemporal features of OFRS, identified the most\nvaluable features and designed an effective combination method. Secondly, we\nconducted a detailed analysis of the spatiotemporal sequences, which revealed\nthe shortcomings of self-attention in OFRS, and proposed a more optimized\nspatiotemporal sequence method for replacing self-attention. In addition, we\nalso designed a Dynamic Context Adaptation Model to further improve the\nefficiency and performance of OFRS. Through the offline experiments on two\nlarge datasets and online experiments for a week, the feasibility and\nsuperiority of our model were proven.\n","authors":["Shaochuan Lin","Jiayan Pei","Taotao Zhou","Hengxu He","Jia Jia","Ning Hu"],"pdf_url":"https://arxiv.org/pdf/2308.04019v1.pdf","comment":"accepted by SIGIR 2023"},{"id":"http://arxiv.org/abs/2308.04017v1","updated":"2023-08-08T03:24:44Z","published":"2023-08-08T03:24:44Z","title":"Multi-Granularity Attention Model for Group Recommendation","summary":" Group recommendation provides personalized recommendations to a group of\nusers based on their shared interests, preferences, and characteristics.\nCurrent studies have explored different methods for integrating individual\npreferences and making collective decisions that benefit the group as a whole.\nHowever, most of them heavily rely on users with rich behavior and ignore\nlatent preferences of users with relatively sparse behavior, leading to\ninsufficient learning of individual interests. To address this challenge, we\npresent the Multi-Granularity Attention Model (MGAM), a novel approach that\nutilizes multiple levels of granularity (i.e., subsets, groups, and supersets)\nto uncover group members' latent preferences and mitigate recommendation noise.\nSpecifically, we propose a Subset Preference Extraction module that enhances the\nrepresentation of users' latent subset-level preferences by incorporating their\nprevious interactions with items and utilizing a hierarchical mechanism.\nAdditionally, our method introduces a Group Preference Extraction module and a\nSuperset Preference Extraction module, which explore users' latent preferences\non two levels: the group-level, which maintains users' original preferences,\nand the superset-level, which includes group-group exterior information. 
By\nincorporating the subset-level embedding, group-level embedding, and\nsuperset-level embedding, our proposed method effectively reduces group\nrecommendation noise across multiple granularities and comprehensively learns\nindividual interests. Extensive offline and online experiments have\ndemonstrated the superiority of our method in terms of performance.\n","authors":["Jianye Ji","Jiayan Pei","Shaochuan Lin","Taotao Zhou","Hengxu He","Jia Jia","Ning Hu"],"pdf_url":"https://arxiv.org/pdf/2308.04017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04579v1","updated":"2023-08-08T20:54:59Z","published":"2023-08-08T20:54:59Z","title":"RECipe: Does a Multi-Modal Recipe Knowledge Graph Fit a Multi-Purpose\n Recommendation System?","summary":" Over the past two decades, recommendation systems (RSs) have used machine\nlearning (ML) solutions to recommend items, e.g., movies, books, and\nrestaurants, to clients of a business or an online platform. Recipe\nrecommendation, however, has not yet received much attention compared to those\napplications. We introduce RECipe as a multi-purpose recipe recommendation\nframework with a multi-modal knowledge graph (MMKG) backbone. The motivation\nbehind RECipe is to go beyond (deep) neural collaborative filtering (NCF) by\nrecommending recipes to users when they query in natural language or by\nproviding an image. RECipe consists of 3 subsystems: (1) behavior-based\nrecommender, (2) review-based recommender, and (3) image-based recommender.\nEach subsystem relies on the embedding representations of entities and\nrelations in the graph. We first obtain (pre-trained) embedding representations\nof textual entities, such as reviews or ingredients, from a fine-tuned model of\nMicrosoft's MPNet. We initialize the weights of the entities with these\nembeddings to train our knowledge graph embedding (KGE) model. For the visual\ncomponent, i.e., recipe images, we develop a KGE-Guided variational autoencoder\n(KG-VAE) to learn the distribution of images and their latent representations.\nOnce KGE and KG-VAE models are fully trained, we use them as a multi-purpose\nrecommendation framework. For benchmarking, we created two knowledge graphs\n(KGs) from public datasets on Kaggle for recipe recommendation. Our experiments\nshow that the KGE models have comparable performance to the neural solutions.\nWe also present pre-trained NLP embeddings to address important applications\nsuch as zero-shot inference for new users (or the cold start problem) and\nconditional recommendation with respect to recipe categories. We eventually\ndemonstrate the application of RECipe in a multi-purpose recommendation\nsetting.\n","authors":["Ali Pesaranghader","Touqir Sajed"],"pdf_url":"https://arxiv.org/pdf/2308.04579v1.pdf","comment":"19 pages, 8 figures, 8 tables"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.04431v1","updated":"2023-08-08T17:58:45Z","published":"2023-08-08T17:58:45Z","title":"When More is Less: Incorporating Additional Datasets Can Hurt\n Performance By Introducing Spurious Correlations","summary":" In machine learning, incorporating more data is often seen as a reliable\nstrategy for improving model performance; this work challenges that notion by\ndemonstrating that the addition of external datasets in many cases can hurt the\nresulting model's performance. 
In a large-scale empirical study across\ncombinations of four different open-source chest x-ray datasets and 9 different\nlabels, we demonstrate that in 43% of settings, a model trained on data from\ntwo hospitals has poorer worst group accuracy over both hospitals than a model\ntrained on just a single hospital's data. This surprising result occurs even\nthough the added hospital makes the training distribution more similar to the\ntest distribution. We explain that this phenomenon arises from the spurious\ncorrelation that emerges between the disease and hospital, due to\nhospital-specific image artifacts. We highlight the trade-off one encounters\nwhen training on multiple datasets, between the obvious benefit of additional\ndata and insidious cost of the introduced spurious correlation. In some cases,\nbalancing the dataset can remove the spurious correlation and improve\nperformance, but it is not always an effective strategy. We contextualize our\nresults within the literature on spurious correlations to help explain these\noutcomes. Our experiments underscore the importance of exercising caution when\nselecting training data for machine learning models, especially in settings\nwhere there is a risk of spurious correlations such as with medical imaging.\nThe risks outlined highlight the need for careful data selection and model\nevaluation in future research and practice.\n","authors":["Rhys Compton","Lily Zhang","Aahlad Puli","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2308.04431v1.pdf","comment":"Accepted at MLHC 2023"},{"id":"http://arxiv.org/abs/2308.04430v1","updated":"2023-08-08T17:58:15Z","published":"2023-08-08T17:58:15Z","title":"SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore","summary":" The legality of training language models (LMs) on copyrighted or otherwise\nrestricted data is under intense debate. However, as we show, model performance\nsignificantly degrades if trained only on low-risk text (e.g., out-of-copyright\nbooks or government documents), due to its limited size and domain coverage. We\npresent SILO, a new language model that manages this risk-performance tradeoff\nduring inference. SILO is built by (1) training a parametric LM on Open License\nCorpus (OLC), a new corpus we curate with 228B tokens of public domain and\npermissively licensed text and (2) augmenting it with a more general and easily\nmodifiable nonparametric datastore (e.g., containing copyrighted books or news)\nthat is only queried during inference. The datastore allows use of high-risk\ndata without training on it, supports sentence-level data attribution, and\nenables data producers to opt out from the model by removing content from the\nstore. These capabilities can foster compliance with data-use regulations such\nas the fair use doctrine in the United States and the GDPR in the European\nUnion. Our experiments show that the parametric LM struggles on domains not\ncovered by OLC. However, access to the datastore greatly improves out of domain\nperformance, closing 90% of the performance gap with an LM trained on the Pile,\na more diverse corpus with mostly high-risk text. We also analyze which\nnonparametric approach works best, where the remaining errors lie, and how\nperformance scales with datastore size. Our results suggest that it is possible\nto build high quality language models while mitigating their legal risk.\n","authors":["Sewon Min","Suchin Gururangan","Eric Wallace","Hannaneh Hajishirzi","Noah A. 
Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2308.04430v1.pdf","comment":"27 pages; 6 figures. Code, models, and data available at\n https://github.com/kernelmachine/silo-lm"},{"id":"http://arxiv.org/abs/2308.04428v1","updated":"2023-08-08T17:56:20Z","published":"2023-08-08T17:56:20Z","title":"Meta-Learning Operators to Optimality from Multi-Task Non-IID Data","summary":" A powerful concept behind much of the recent progress in machine learning is\nthe extraction of common features across data from heterogeneous sources or\ntasks. Intuitively, using all of one's data to learn a common representation\nfunction benefits both computational effort and statistical generalization by\nleaving a smaller number of parameters to fine-tune on a given task. Toward\ntheoretically grounding these merits, we propose a general setting of\nrecovering linear operators $M$ from noisy vector measurements $y = Mx + w$,\nwhere the covariates $x$ may be both non-i.i.d. and non-isotropic. We\ndemonstrate that existing isotropy-agnostic meta-learning approaches incur\nbiases on the representation update, which causes the scaling of the noise\nterms to lose favorable dependence on the number of source tasks. This in turn\ncan cause the sample complexity of representation learning to be bottlenecked\nby the single-task data size. We introduce an adaptation, $\\texttt{De-bias &\nFeature-Whiten}$ ($\\texttt{DFW}$), of the popular alternating\nminimization-descent (AMD) scheme proposed in Collins et al., (2021), and\nestablish linear convergence to the optimal representation with noise level\nscaling down with the $\\textit{total}$ source data size. This leads to\ngeneralization bounds on the same order as an oracle empirical risk minimizer.\nWe verify the vital importance of $\\texttt{DFW}$ on various numerical\nsimulations. In particular, we show that vanilla alternating-minimization\ndescent fails catastrophically even for iid, but mildly non-isotropic data. Our\nanalysis unifies and generalizes prior work, and provides a flexible framework\nfor a wider range of applications, such as in controls and dynamical systems.\n","authors":["Thomas T. C. K. Zhang","Leonardo F. Toso","James Anderson","Nikolai Matni"],"pdf_url":"https://arxiv.org/pdf/2308.04428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04426v1","updated":"2023-08-08T17:55:30Z","published":"2023-08-08T17:55:30Z","title":"A Deep-Learning Method Using Auto-encoder and Generative Adversarial\n Network for Anomaly Detection on Ancient Stone Stele Surfaces","summary":" Accurate detection of natural deterioration and man-made damage on the\nsurfaces of ancient stele in the first instance is essential for their\npreventive conservation. Existing methods for cultural heritage preservation\nare not able to achieve this goal perfectly due to the difficulty of balancing\naccuracy, efficiency, timeliness, and cost. This paper presents a deep-learning\nmethod to automatically detect above mentioned emergencies on ancient stone\nstele in real time, employing autoencoder (AE) and generative adversarial\nnetwork (GAN). The proposed method overcomes the limitations of existing\nmethods by requiring no extensive anomaly samples while enabling comprehensive\ndetection of unpredictable anomalies. 
The method includes stages of monitoring,\ndata acquisition, pre-processing, model structuring, and post-processing.\nTaking the Longmen Grottoes' stone steles as a case study, an unsupervised\nlearning model based on AE and GAN architectures is proposed and validated with\na reconstruction accuracy of 99.74\\%. The method's evaluation revealed the\nproficient detection of seven artificially designed anomalies and demonstrated\nprecision and reliability without false alarms. This research provides novel\nideas and possibilities for the application of deep learning in the field of\ncultural heritage.\n","authors":["Yikun Liu","Yuning Wang","Cheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07774v3","updated":"2023-08-08T17:48:12Z","published":"2022-09-19T22:57:10Z","title":"Learning To Rank Diversely At Airbnb","summary":" Airbnb is a two-sided marketplace, bringing together hosts who own listings\nfor rent, with prospective guests from around the globe. Applying neural\nnetwork-based learning to rank techniques has led to significant improvements\nin matching guests with hosts. These improvements in ranking were driven by a\ncore strategy: order the listings by their estimated booking probabilities,\nthen iterate on techniques to make these booking probability estimates more and\nmore accurate. Embedded implicitly in this strategy was an assumption that the\nbooking probability of a listing could be determined independently of other\nlistings in search results. In this paper we discuss how this assumption,\npervasive throughout the commonly-used learning to rank frameworks, is false.\nWe provide a theoretical foundation correcting this assumption, followed by\nefficient neural network architectures based on the theory. Explicitly\naccounting for possible similarities between listings and reducing them to\ndiversify the search results generated a strong positive impact. We discuss these\nmetric wins as part of the online A/B tests of the theory. Our method provides\na practical way to diversify search results for large-scale production ranking\nsystems.\n","authors":["Malay Haldar","Mustafa Abdool","Liwei He","Dillon Davis","Huiji Gao","Sanjeev Katariya"],"pdf_url":"https://arxiv.org/pdf/2210.07774v3.pdf","comment":"Search ranking, Diversity, e-commerce"},{"id":"http://arxiv.org/abs/2308.04417v1","updated":"2023-08-08T17:34:28Z","published":"2023-08-08T17:34:28Z","title":"DiffCR: A Fast Conditional Diffusion Framework for Cloud Removal from\n Optical Satellite Images","summary":" Optical satellite images are a critical data source; however, cloud cover\noften compromises their quality, hindering image applications and analysis.\nConsequently, effectively removing clouds from optical satellite images has\nemerged as a prominent research direction. While recent advancements in cloud\nremoval primarily rely on generative adversarial networks, which may yield\nsuboptimal image quality, diffusion models have demonstrated remarkable success\nin diverse image-generation tasks, showcasing their potential in addressing\nthis challenge. This paper presents a novel framework called DiffCR, which\nleverages conditional guided diffusion with deep convolutional networks for\nhigh-performance cloud removal for optical satellite imagery. 
Specifically, we\nintroduce a decoupled encoder for conditional image feature extraction,\nproviding a robust color representation to ensure the close similarity of\nappearance information between the conditional input and the synthesized\noutput. Moreover, we propose a novel and efficient time and condition fusion\nblock within the cloud removal model to accurately simulate the correspondence\nbetween the appearance in the conditional image and the target image at a low\ncomputational cost. Extensive experimental evaluations on two commonly used\nbenchmark datasets demonstrate that DiffCR consistently achieves\nstate-of-the-art performance on all metrics, with parameter and computational\ncomplexities amounting to only 5.1% and 5.4%, respectively, of those previous\nbest methods. The source code, pre-trained models, and all the experimental\nresults will be publicly available at https://github.com/XavierJiezou/DiffCR\nupon the paper's acceptance of this work.\n","authors":["Xuechao Zou","Kai Li","Junliang Xing","Yu Zhang","Shiying Wang","Lei Jin","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.04417v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.09345v2","updated":"2023-08-08T17:26:58Z","published":"2023-06-15T17:59:51Z","title":"Evaluating Data Attribution for Text-to-Image Models","summary":" While large text-to-image models are able to synthesize \"novel\" images, these\nimages are necessarily a reflection of the training data. The problem of data\nattribution in such models -- which of the images in the training set are most\nresponsible for the appearance of a given generated image -- is a difficult yet\nimportant one. As an initial step toward this problem, we evaluate attribution\nthrough \"customization\" methods, which tune an existing large-scale model\ntoward a given exemplar object or style. Our key insight is that this allows us\nto efficiently create synthetic images that are computationally influenced by\nthe exemplar by construction. With our new dataset of such exemplar-influenced\nimages, we are able to evaluate various data attribution algorithms and\ndifferent possible feature spaces. Furthermore, by training on our dataset, we\ncan tune standard models, such as DINO, CLIP, and ViT, toward the attribution\nproblem. Even though the procedure is tuned towards small exemplar sets, we\nshow generalization to larger sets. Finally, by taking into account the\ninherent uncertainty of the problem, we can assign soft attribution scores over\na set of training images.\n","authors":["Sheng-Yu Wang","Alexei A. Efros","Jun-Yan Zhu","Richard Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.09345v2.pdf","comment":"Updated v2 -- ICCV 2023 camera ready version. Project page:\n https://peterwang512.github.io/GenDataAttribution Code:\n https://github.com/PeterWang512/GenDataAttribution"},{"id":"http://arxiv.org/abs/2308.04412v1","updated":"2023-08-08T17:18:04Z","published":"2023-08-08T17:18:04Z","title":"Probabilistic Invariant Learning with Randomized Linear Classifiers","summary":" Designing models that are both expressive and preserve known invariances of\ntasks is an increasingly hard problem. Existing solutions tradeoff invariance\nfor computational or memory resources. In this work, we show how to leverage\nrandomness and design models that are both expressive and invariant but use\nless resources. 
Inspired by randomized algorithms, our key insight is that\naccepting probabilistic notions of universal approximation and invariance can\nreduce our resource requirements. More specifically, we propose a class of\nbinary classification models called Randomized Linear Classifiers (RLCs). We\ngive parameter and sample size conditions in which RLCs can, with high\nprobability, approximate any (smooth) function while preserving invariance to\ncompact group transformations. Leveraging this result, we design three RLCs\nthat are provably probabilistic invariant for classification tasks over sets,\ngraphs, and spherical data. We show how these models can achieve probabilistic\ninvariance and universality using less resources than (deterministic) neural\nnetworks and their invariant counterparts. Finally, we empirically demonstrate\nthe benefits of this new class of models on invariant tasks where deterministic\ninvariant neural networks are known to struggle.\n","authors":["Leonardo Cotta","Gal Yehuda","Assaf Schuster","Chris J. Maddison"],"pdf_url":"https://arxiv.org/pdf/2308.04412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04406v1","updated":"2023-08-08T17:10:23Z","published":"2023-08-08T17:10:23Z","title":"XGBD: Explanation-Guided Graph Backdoor Detection","summary":" Backdoor attacks pose a significant security risk to graph learning models.\nBackdoors can be embedded into the target model by inserting backdoor triggers\ninto the training dataset, causing the model to make incorrect predictions when\nthe trigger is present. To counter backdoor attacks, backdoor detection has\nbeen proposed. An emerging detection strategy in the vision and NLP domains is\nbased on an intriguing phenomenon: when training models on a mixture of\nbackdoor and clean samples, the loss on backdoor samples drops significantly\nfaster than on clean samples, allowing backdoor samples to be easily detected\nby selecting samples with the lowest loss values. However, the ignorance of\ntopological feature information on graph data limits its detection\neffectiveness when applied directly to the graph domain. To this end, we\npropose an explanation-guided backdoor detection method to take advantage of\nthe topological information. Specifically, we train a helper model on the graph\ndataset, feed graph samples into the model, and then adopt explanation methods\nto attribute model prediction to an important subgraph. We observe that\nbackdoor samples have distinct attribution distribution than clean samples, so\nthe explanatory subgraph could serve as more discriminative features for\ndetecting backdoor samples. Comprehensive experiments on multiple popular\ndatasets and attack methods demonstrate the effectiveness and explainability of\nour method. Our code is available:\nhttps://github.com/GuanZihan/GNN_backdoor_detection.\n","authors":["Zihan Guan","Mengnan Du","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04406v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.04396v1","updated":"2023-08-08T17:00:30Z","published":"2023-08-08T17:00:30Z","title":"Event Abstraction for Enterprise Collaboration Systems to Support Social\n Process Mining","summary":" One aim of Process Mining (PM) is the discovery of process models from event\nlogs of information systems. PM has been successfully applied to\nprocess-oriented enterprise systems but is less suited for communication- and\ndocument-oriented Enterprise Collaboration Systems (ECS). 
ECS event logs are\nvery fine-granular and PM applied to their logs results in spaghetti models. A\ncommon solution for this is event abstraction, i.e., converting low-level logs\ninto more abstract high-level logs before running discovery algorithms. ECS\nlogs come with special characteristics that have so far not been fully\naddressed by existing event abstraction approaches. We aim to close this gap\nwith a tailored ECS event abstraction (ECSEA) approach that trains a model by\ncomparing recorded actual user activities (high-level traces) with the\nsystem-generated low-level traces (extracted from the ECS). The model allows us\nto automatically convert future low-level traces into an abstracted high-level\nlog that can be used for PM. Our evaluation shows that the algorithm produces\naccurate results. ECSEA is a preprocessing method that is essential for the\ninterpretation of collaborative work activity in ECS, which we call Social\nProcess Mining.\n","authors":["Jonas Blatt","Patrick Delfmann","Petra Schubert"],"pdf_url":"https://arxiv.org/pdf/2308.04396v1.pdf","comment":"8 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2308.04395v1","updated":"2023-08-08T17:00:11Z","published":"2023-08-08T17:00:11Z","title":"Data Augmentation-Based Unsupervised Domain Adaptation In Medical\n Imaging","summary":" Deep learning-based models in medical imaging often struggle to generalize\neffectively to new scans due to data heterogeneity arising from differences in\nhardware, acquisition parameters, population, and artifacts. This limitation\npresents a significant challenge in adopting machine learning models for\nclinical practice. We propose an unsupervised method for robust domain\nadaptation in brain MRI segmentation by leveraging MRI-specific augmentation\ntechniques. To evaluate the effectiveness of our method, we conduct extensive\nexperiments across diverse datasets, modalities, and segmentation tasks,\ncomparing against the state-of-the-art methods. The results show that our\nproposed approach achieves high accuracy, exhibits broad applicability, and\nshowcases remarkable robustness against domain shift in various tasks,\nsurpassing the state-of-the-art performance in the majority of cases.\n","authors":["Sebastian Nørgaard Llambias","Mads Nielsen","Mostafa Mehdipour Ghazi"],"pdf_url":"https://arxiv.org/pdf/2308.04395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04375v1","updated":"2023-08-08T16:23:46Z","published":"2023-08-08T16:23:46Z","title":"Understanding the Effect of Counterfactual Explanations on Trust and\n Reliance on AI for Human-AI Collaborative Clinical Decision Making","summary":" Artificial intelligence (AI) is increasingly being considered to assist human\ndecision-making in high-stake domains (e.g. health). However, researchers have\ndiscussed an issue that humans can over-rely on wrong suggestions of the AI\nmodel instead of achieving human AI complementary performance. In this work, we\nutilized salient feature explanations along with what-if, counterfactual\nexplanations to make humans review AI suggestions more analytically to reduce\noverreliance on AI and explored the effect of these explanations on trust and\nreliance on AI during clinical decision-making. 
We conducted an experiment with\nseven therapists and ten laypersons on the task of assessing post-stroke\nsurvivors' quality of motion, and analyzed their performance, agreement level\non the task, and reliance on AI without and with two types of AI explanations.\nOur results showed that the AI model with both salient features and\ncounterfactual explanations assisted therapists and laypersons to improve their\nperformance and agreement level on the task when `right' AI outputs are\npresented. While both therapists and laypersons over-relied on `wrong' AI\noutputs, counterfactual explanations assisted both therapists and laypersons to\nreduce their over-reliance on `wrong' AI outputs by 21\\% compared to salient\nfeature explanations. Specifically, laypersons had higher performance degrades\nby 18.0 f1-score with salient feature explanations and 14.0 f1-score with\ncounterfactual explanations than therapists with performance degrades of 8.6\nand 2.8 f1-scores respectively. Our work discusses the potential of\ncounterfactual explanations to better estimate the accuracy of an AI model and\nreduce over-reliance on `wrong' AI outputs and implications for improving\nhuman-AI collaborative decision-making.\n","authors":["Min Hun Lee","Chong Jun Chew"],"pdf_url":"https://arxiv.org/pdf/2308.04375v1.pdf","comment":"ACM CSCW 2023"},{"id":"http://arxiv.org/abs/2308.04373v1","updated":"2023-08-08T16:22:44Z","published":"2023-08-08T16:22:44Z","title":"Pelta: Shielding Transformers to Mitigate Evasion Attacks in Federated\n Learning","summary":" The main premise of federated learning is that machine learning model updates\nare computed locally, in particular to preserve user data privacy, as those\nnever leave the perimeter of their device. This mechanism supposes the general\nmodel, once aggregated, to be broadcast to collaborating and non malicious\nnodes. However, without proper defenses, compromised clients can easily probe\nthe model inside their local memory in search of adversarial examples. For\ninstance, considering image-based applications, adversarial examples consist of\nimperceptibly perturbed images (to the human eye) misclassified by the local\nmodel, which can be later presented to a victim node's counterpart model to\nreplicate the attack. To mitigate such malicious probing, we introduce Pelta, a\nnovel shielding mechanism leveraging trusted hardware. By harnessing the\ncapabilities of Trusted Execution Environments (TEEs), Pelta masks part of the\nback-propagation chain rule, otherwise typically exploited by attackers for the\ndesign of malicious samples. We evaluate Pelta on a state of the art ensemble\nmodel and demonstrate its effectiveness against the Self Attention Gradient\nadversarial Attack.\n","authors":["Simon Queyrut","Yérom-David Bromberg","Valerio Schiavoni"],"pdf_url":"https://arxiv.org/pdf/2308.04373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06713v2","updated":"2023-08-08T16:21:49Z","published":"2023-07-13T12:11:36Z","title":"Unsupervised Calibration through Prior Adaptation for Text\n Classification using Large Language Models","summary":" A wide variety of natural language tasks are currently being addressed with\nlarge-scale language models (LLMs). These models are usually trained with a\nvery large amount of unsupervised text data and adapted to perform a downstream\nnatural language task using methods like fine-tuning, calibration or in-context\nlearning. 
In this work, we propose an approach to adapt the prior class\ndistribution to perform text classification tasks without the need for labelled\nsamples and only few in-domain sample queries. The proposed approach treats the\nLLM as a black box, adding a stage where the model posteriors are calibrated to\nthe task. Results show that these methods outperform the un-adapted model for\ndifferent number of training shots in the prompt and a previous approach were\ncalibration is performed without using any adaptation data.\n","authors":["Lautaro Estienne"],"pdf_url":"https://arxiv.org/pdf/2307.06713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03735v2","updated":"2023-08-08T16:20:18Z","published":"2023-08-07T17:34:58Z","title":"Randomized algorithms for precise measurement of differentially-private,\n personalized recommendations","summary":" Personalized recommendations form an important part of today's internet\necosystem, helping artists and creators to reach interested users, and helping\nusers to discover new and engaging content. However, many users today are\nskeptical of platforms that personalize recommendations, in part due to\nhistorically careless treatment of personal data and data privacy. Now,\nbusinesses that rely on personalized recommendations are entering a new\nparadigm, where many of their systems must be overhauled to be privacy-first.\nIn this article, we propose an algorithm for personalized recommendations that\nfacilitates both precise and differentially-private measurement. We consider\nadvertising as an example application, and conduct offline experiments to\nquantify how the proposed privacy-preserving algorithm affects key metrics\nrelated to user experience, advertiser value, and platform revenue compared to\nthe extremes of both (private) non-personalized and non-private, personalized\nimplementations.\n","authors":["Allegra Laro","Yanqing Chen","Hao He","Babak Aghazadeh"],"pdf_url":"https://arxiv.org/pdf/2308.03735v2.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2305.19259v3","updated":"2023-08-08T16:05:55Z","published":"2023-05-30T17:47:27Z","title":"Shuffle SGD is Always Better than SGD: Improved Analysis of SGD with\n Arbitrary Data Orders","summary":" Stochastic Gradient Descent (SGD) algorithms are widely used in optimizing\nneural networks, with Random Reshuffling (RR) and Single Shuffle (SS) being\npopular choices for cycling through random or single permutations of the\ntraining data. However, the convergence properties of these algorithms in the\nnon-convex case are not fully understood. Existing results suggest that, in\nrealistic training scenarios where the number of epochs is smaller than the\ntraining set size, RR may perform worse than SGD.\n In this paper, we analyze a general SGD algorithm that allows for arbitrary\ndata orderings and show improved convergence rates for non-convex functions.\nSpecifically, our analysis reveals that SGD with random and single shuffling is\nalways faster or at least as good as classical SGD with replacement, regardless\nof the number of iterations. Overall, our study highlights the benefits of\nusing SGD with random/single shuffling and provides new insights into its\nconvergence properties for non-convex optimization.\n","authors":["Anastasia Koloskova","Nikita Doikov","Sebastian U. 
Stich","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2305.19259v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03571v2","updated":"2023-08-08T16:05:01Z","published":"2023-07-07T13:06:12Z","title":"Smoothing the Edges: A General Framework for Smooth Optimization in\n Sparse Regularization using Hadamard Overparametrization","summary":" This paper presents a framework for smooth optimization of objectives with\n$\\ell_q$ and $\\ell_{p,q}$ regularization for (structured) sparsity. Finding\nsolutions to these non-smooth and possibly non-convex problems typically relies\non specialized optimization routines. In contrast, the method studied here is\ncompatible with off-the-shelf (stochastic) gradient descent that is ubiquitous\nin deep learning, thereby enabling differentiable sparse regularization without\napproximations. The proposed optimization transfer comprises an\noverparametrization of selected model parameters followed by a change of\npenalties. In the overparametrized problem, smooth and convex $\\ell_2$\nregularization induces non-smooth and non-convex regularization in the original\nparametrization. We show that the resulting surrogate problem not only has an\nidentical global optimum but also exactly preserves the local minima. This is\nparticularly useful in non-convex regularization, where finding global\nsolutions is NP-hard and local minima often generalize well. We provide an\nintegrative overview that consolidates various literature strands on\nsparsity-inducing parametrizations in a general setting and meaningfully extend\nexisting approaches. The feasibility of our approach is evaluated through\nnumerical experiments, demonstrating its effectiveness by matching or\noutperforming common implementations of convex and non-convex regularizers.\n","authors":["Chris Kolb","Christian L. Müller","Bernd Bischl","David Rügamer"],"pdf_url":"https://arxiv.org/pdf/2307.03571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04365v1","updated":"2023-08-08T16:04:42Z","published":"2023-08-08T16:04:42Z","title":"SLEM: Machine Learning for Path Modeling and Causal Inference with Super\n Learner Equation Modeling","summary":" Causal inference is a crucial goal of science, enabling researchers to arrive\nat meaningful conclusions regarding the predictions of hypothetical\ninterventions using observational data. Path models, Structural Equation Models\n(SEMs), and, more generally, Directed Acyclic Graphs (DAGs), provide a means to\nunambiguously specify assumptions regarding the causal structure underlying a\nphenomenon. Unlike DAGs, which make very few assumptions about the functional\nand parametric form, SEM assumes linearity. This can result in functional\nmisspecification which prevents researchers from undertaking reliable effect\nsize estimation. In contrast, we propose Super Learner Equation Modeling, a\npath modeling technique integrating machine learning Super Learner ensembles.\nWe empirically demonstrate its ability to provide consistent and unbiased\nestimates of causal effects, its competitive performance for linear models when\ncompared with SEM, and highlight its superiority over SEM when dealing with\nnon-linear relationships. We provide open-source code, and a tutorial notebook\nwith example usage, accentuating the easy-to-use nature of the method.\n","authors":["Matthew J. 
Vowels"],"pdf_url":"https://arxiv.org/pdf/2308.04365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16565v2","updated":"2023-08-08T16:01:41Z","published":"2023-03-29T09:47:48Z","title":"PMAA: A Progressive Multi-scale Attention Autoencoder Model for\n High-performance Cloud Removal from Multi-temporal Satellite Imagery","summary":" Satellite imagery analysis plays a pivotal role in remote sensing; however,\ninformation loss due to cloud cover significantly impedes its application.\nAlthough existing deep cloud removal models have achieved notable outcomes,\nthey scarcely consider contextual information. This study introduces a\nhigh-performance cloud removal architecture, termed Progressive Multi-scale\nAttention Autoencoder (PMAA), which concurrently harnesses global and local\ninformation to construct robust contextual dependencies using a novel\nMulti-scale Attention Module (MAM) and a novel Local Interaction Module (LIM).\nPMAA establishes long-range dependencies of multi-scale features using MAM and\nmodulates the reconstruction of fine-grained details utilizing LIM, enabling\nsimultaneous representation of fine- and coarse-grained features at the same\nlevel. With the help of diverse and multi-scale features, PMAA consistently\noutperforms the previous state-of-the-art model CTGAN on two benchmark\ndatasets. Moreover, PMAA boasts considerable efficiency advantages, with only\n0.5% and 14.6% of the parameters and computational complexity of CTGAN,\nrespectively. These comprehensive results underscore PMAA's potential as a\nlightweight cloud removal network suitable for deployment on edge devices to\naccomplish large-scale cloud removal tasks. Our source code and pre-trained\nmodels are available at https://github.com/XavierJiezou/PMAA.\n","authors":["Xuechao Zou","Kai Li","Junliang Xing","Pin Tao","Yachao Cui"],"pdf_url":"https://arxiv.org/pdf/2303.16565v2.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2308.04341v1","updated":"2023-08-08T15:38:55Z","published":"2023-08-08T15:38:55Z","title":"Accurate, Explainable, and Private Models: Providing Recourse While\n Minimizing Training Data Leakage","summary":" Machine learning models are increasingly utilized across impactful domains to\npredict individual outcomes. As such, many models provide algorithmic recourse\nto individuals who receive negative outcomes. However, recourse can be\nleveraged by adversaries to disclose private information. This work presents\nthe first attempt at mitigating such attacks. We present two novel methods to\ngenerate differentially private recourse: Differentially Private Model (DPM)\nand Laplace Recourse (LR). Using logistic regression classifiers and real world\nand synthetic datasets, we find that DPM and LR perform well in reducing what\nan adversary can infer, especially at low FPR. 
When training dataset size is\nlarge enough, we find particular success in preventing privacy leakage while\nmaintaining model and recourse accuracy with our novel LR method.\n","authors":["Catherine Huang","Chelse Swoopes","Christina Xiao","Jiaqi Ma","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2308.04341v1.pdf","comment":"Proceedings of The Second Workshop on New Frontiers in Adversarial\n Machine Learning (AdvML-Frontiers @ ICML 2023)"},{"id":"http://arxiv.org/abs/2308.03629v2","updated":"2023-08-08T15:38:21Z","published":"2023-08-07T14:36:03Z","title":"MedMine: Examining Pre-trained Language Models on Medication Mining","summary":" Automatic medication mining from clinical and biomedical text has become a\npopular topic due to its real impact on healthcare applications and the recent\ndevelopment of powerful language models (LMs). However, fully-automatic\nextraction models still face obstacles to be overcome such that they can be\ndeployed directly into clinical practice for better impacts. Such obstacles\ninclude their imbalanced performances on different entity types and clinical\nevents. In this work, we examine current state-of-the-art pre-trained language\nmodels (PLMs) on such tasks, via fine-tuning including the monolingual model\nMed7 and multilingual large language model (LLM) XLM-RoBERTa. We compare their\nadvantages and drawbacks using historical medication mining shared task data\nsets from n2c2-2018 challenges. We report the findings we get from these\nfine-tuning experiments such that they can facilitate future research on\naddressing them, for instance, how to combine their outputs, merge such models,\nor improve their overall accuracy by ensemble learning and data augmentation.\nMedMine is part of the M3 Initiative \\url{https://github.com/HECTA-UoM/M3}\n","authors":["Haifa Alrdahi","Lifeng Han","Hendrik Šuvalov","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.03629v2.pdf","comment":"Open Research Project. 7 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2305.12522v2","updated":"2023-08-08T15:22:26Z","published":"2023-05-21T17:46:28Z","title":"P-NOC: Adversarial CAM Generation for Weakly Supervised Semantic\n Segmentation","summary":" To mitigate the necessity for large amounts of supervised segmentation\nannotation sets, multiple Weakly Supervised Semantic Segmentation (WSSS)\nstrategies have been devised. These will often rely on advanced data and model\nregularization strategies to instigate the development of useful properties\n(e.g., prediction completeness and fidelity to semantic boundaries) in\nsegmentation priors, notwithstanding the lack of annotated information. In this\nwork, we first create a strong baseline by analyzing complementary WSSS\ntechniques and regularizing strategies, considering their strengths and\nlimitations. We then propose a new Class-specific Adversarial Erasing strategy,\ncomprising two adversarial CAM generating networks being gradually refined to\nproduce robust semantic segmentation proposals. 
Empirical results suggest that\nour approach induces substantial improvement in the effectiveness of the\nbaseline, resulting in a noticeable improvement over both Pascal VOC 2012 and\nMS COCO 2014 datasets.\n","authors":["Lucas David","Helio Pedrini","Zanoni Dias"],"pdf_url":"https://arxiv.org/pdf/2305.12522v2.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.04332v1","updated":"2023-08-08T15:21:30Z","published":"2023-08-08T15:21:30Z","title":"RLHF-Blender: A Configurable Interactive Interface for Learning from\n Diverse Human Feedback","summary":" To use reinforcement learning from human feedback (RLHF) in practical\napplications, it is crucial to learn reward models from diverse sources of\nhuman feedback and to consider human factors involved in providing feedback of\ndifferent types. However, the systematic study of learning from diverse types\nof feedback is held back by limited standardized tooling available to\nresearchers. To bridge this gap, we propose RLHF-Blender, a configurable,\ninteractive interface for learning from human feedback. RLHF-Blender provides a\nmodular experimentation framework and implementation that enables researchers\nto systematically investigate the properties and qualities of human feedback\nfor reward learning. The system facilitates the exploration of various feedback\ntypes, including demonstrations, rankings, comparisons, and natural language\ninstructions, as well as studies considering the impact of human factors on\ntheir effectiveness. We discuss a set of concrete research opportunities\nenabled by RLHF-Blender. More information is available at\nhttps://rlhfblender.info/.\n","authors":["Yannick Metz","David Lindner","Raphaël Baur","Daniel Keim","Mennatallah El-Assady"],"pdf_url":"https://arxiv.org/pdf/2308.04332v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.07873v3","updated":"2023-08-08T15:13:22Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding Adversarial\n Transferability From Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. 
Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v3.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2302.01075v5","updated":"2023-08-08T15:12:42Z","published":"2023-02-02T13:05:27Z","title":"MonoFlow: Rethinking Divergence GANs via the Perspective of Wasserstein\n Gradient Flows","summary":" The conventional understanding of adversarial training in generative\nadversarial networks (GANs) is that the discriminator is trained to estimate a\ndivergence, and the generator learns to minimize this divergence. We argue that\ndespite the fact that many variants of GANs were developed following this\nparadigm, the current theoretical understanding of GANs and their practical\nalgorithms are inconsistent. In this paper, we leverage Wasserstein gradient\nflows which characterize the evolution of particles in the sample space, to\ngain theoretical insights and algorithmic inspiration of GANs. We introduce a\nunified generative modeling framework - MonoFlow: the particle evolution is\nrescaled via a monotonically increasing mapping of the log density ratio. Under\nour framework, adversarial training can be viewed as a procedure first\nobtaining MonoFlow's vector field via training the discriminator and the\ngenerator learns to draw the particle flow defined by the corresponding vector\nfield. We also reveal the fundamental difference between variational divergence\nminimization and adversarial training. This analysis helps us to identify what\ntypes of generator loss functions can lead to the successful training of GANs\nand suggest that GANs may have more loss designs beyond the literature (e.g.,\nnon-saturated loss), as long as they realize MonoFlow. 
Consistent empirical\nstudies are included to validate the effectiveness of our framework.\n","authors":["Mingxuan Yi","Zhanxing Zhu","Song Liu"],"pdf_url":"https://arxiv.org/pdf/2302.01075v5.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.04314v1","updated":"2023-08-08T15:02:50Z","published":"2023-08-08T15:02:50Z","title":"Cooperative Multi-agent Bandits: Distributed Algorithms with Optimal\n Individual Regret and Constant Communication Costs","summary":" Recently, there has been extensive study of cooperative multi-agent\nmulti-armed bandits where a set of distributed agents cooperatively play the\nsame multi-armed bandit game. The goal is to develop bandit algorithms with the\noptimal group and individual regrets and low communication between agents. The\nprior work tackled this problem using two paradigms: leader-follower and fully\ndistributed algorithms. Prior algorithms in both paradigms achieve the optimal\ngroup regret. The leader-follower algorithms achieve constant communication\ncosts but fail to achieve optimal individual regrets. The state-of-the-art\nfully distributed algorithms achieve optimal individual regrets but fail to\nachieve constant communication costs. This paper presents a simple yet\neffective communication policy and integrates it into a learning algorithm for\ncooperative bandits. Our algorithm achieves the best of both paradigms: optimal\nindividual regret and constant communication costs.\n","authors":["Lin Yang","Xuchuang Wang","Mohammad Hajiesmaili","Lijun Zhang","John C. S. Lui","Don Towsley"],"pdf_url":"https://arxiv.org/pdf/2308.04314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12344v2","updated":"2023-08-08T14:52:39Z","published":"2023-07-23T14:43:17Z","title":"Right for the Wrong Reason: Can Interpretable ML Techniques Detect\n Spurious Correlations?","summary":" While deep neural network models offer unmatched classification performance,\nthey are prone to learning spurious correlations in the data. Such dependencies\non confounding information can be difficult to detect using performance metrics\nif the test data comes from the same distribution as the training data.\nInterpretable ML methods such as post-hoc explanations or inherently\ninterpretable classifiers promise to identify faulty model reasoning. However,\nthere is mixed evidence whether many of these techniques are actually able to\ndo so. In this paper, we propose a rigorous evaluation strategy to assess an\nexplanation technique's ability to correctly identify spurious correlations.\nUsing this strategy, we evaluate five post-hoc explanation techniques and one\ninherently interpretable method for their ability to detect three types of\nartificially added confounders in a chest x-ray diagnosis task. We find that\nthe post-hoc technique SHAP, as well as the inherently interpretable Attri-Net\nprovide the best performance and can be used to reliably identify faulty model\nbehavior.\n","authors":["Susu Sun","Lisa M. Koch","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2307.12344v2.pdf","comment":"Accepted to MICCAI 2023"},{"id":"http://arxiv.org/abs/2207.07271v3","updated":"2023-08-08T14:51:47Z","published":"2022-07-15T03:37:59Z","title":"Set-based value operators for non-stationary Markovian environments","summary":" This paper analyzes finite state Markov Decision Processes (MDPs) with\nuncertain parameters in compact sets and re-examines results from robust MDP\nvia set-based fixed point theory. 
To this end, we generalize the Bellman and\npolicy evaluation operators to contracting operators on the value function\nspace and denote them as \\emph{value operators}. We lift these value operators\nto act on \\emph{sets} of value functions and denote them as \\emph{set-based\nvalue operators}. We prove that the set-based value operators are\n\\emph{contractions} in the space of compact value function sets. Leveraging\ninsights from set theory, we generalize the rectangularity condition in classic\nrobust MDP literature to a containment condition for all value operators, which\nis weaker and can be applied to a larger set of parameter-uncertain MDPs and\ncontracting operators in dynamic programming. We prove that both the\nrectangularity condition and the containment condition sufficiently ensure that\nthe set-based value operator's fixed point set contains its own extrema\nelements. For convex and compact sets of uncertain MDP parameters, we show\nequivalence between the classic robust value function and the supremum of the\nfixed point set of the set-based Bellman operator. Under dynamically changing\nMDP parameters in compact sets, we prove a set convergence result for value\niteration, which otherwise may not converge to a single value function.\nFinally, we derive novel guarantees for probabilistic path-planning problems in\nplanet exploration and stratospheric station-keeping.\n","authors":["Sarah H. Q. Li","Assalé Adjé","Pierre-Loïc Garoche","Behçet Açıkmeşe"],"pdf_url":"https://arxiv.org/pdf/2207.07271v3.pdf","comment":"17 pages, 11 figures, 1 table"},{"id":"http://arxiv.org/abs/2303.00500v2","updated":"2023-08-08T14:50:50Z","published":"2023-03-01T13:32:55Z","title":"Inherently Interpretable Multi-Label Classification Using Class-Specific\n Counterfactuals","summary":" Interpretability is essential for machine learning algorithms in high-stakes\napplication fields such as medical image analysis. However, high-performing\nblack-box neural networks do not provide explanations for their predictions,\nwhich can lead to mistrust and suboptimal human-ML collaboration. Post-hoc\nexplanation techniques, which are widely used in practice, have been shown to\nsuffer from severe conceptual problems. Furthermore, as we show in this paper,\ncurrent explanation techniques do not perform adequately in the multi-label\nscenario, in which multiple medical findings may co-occur in a single image. We\npropose Attri-Net, an inherently interpretable model for multi-label\nclassification. Attri-Net is a powerful classifier that provides transparent,\ntrustworthy, and human-understandable explanations. The model first generates\nclass-specific attribution maps based on counterfactuals to identify which\nimage regions correspond to certain medical findings. Then a simple logistic\nregression classifier is used to make predictions based solely on these\nattribution maps. We compare Attri-Net to five post-hoc explanation techniques\nand one inherently interpretable classifier on three chest X-ray datasets. We\nfind that Attri-Net produces high-quality multi-label explanations consistent\nwith clinical knowledge and has comparable classification performance to\nstate-of-the-art classification models.\n","authors":["Susu Sun","Stefano Woerner","Andreas Maier","Lisa M. Koch","Christian F. 
Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2303.00500v2.pdf","comment":"Accepted to MIDL 2023"},{"id":"http://arxiv.org/abs/2308.04304v1","updated":"2023-08-08T14:50:05Z","published":"2023-08-08T14:50:05Z","title":"The Model Inversion Eavesdropping Attack in Semantic Communication\n Systems","summary":" In recent years, semantic communication has been a popular research topic for\nits superiority in communication efficiency. As semantic communication relies\non deep learning to extract meaning from raw messages, it is vulnerable to\nattacks targeting deep learning models. In this paper, we introduce the model\ninversion eavesdropping attack (MIEA) to reveal the risk of privacy leaks in\nthe semantic communication system. In MIEA, the attacker first eavesdrops the\nsignal being transmitted by the semantic communication system and then performs\nmodel inversion attack to reconstruct the raw message, where both the white-box\nand black-box settings are considered. Evaluation results show that MIEA can\nsuccessfully reconstruct the raw message with good quality under different\nchannel conditions. We then propose a defense method based on random\npermutation and substitution to defend against MIEA in order to achieve secure\nsemantic communication. Our experimental results demonstrate the effectiveness\nof the proposed defense method in preventing MIEA.\n","authors":["Yuhao Chen","Qianqian Yang","Zhiguo Shi","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2308.04304v1.pdf","comment":"Accepted by 2023 IEEE Global Communications Conference (GLOBECOM)"},{"id":"http://arxiv.org/abs/2105.02796v2","updated":"2023-08-08T14:34:33Z","published":"2021-05-06T16:41:04Z","title":"Practical and Rigorous Uncertainty Bounds for Gaussian Process\n Regression","summary":" Gaussian Process Regression is a popular nonparametric regression method\nbased on Bayesian principles that provides uncertainty estimates for its\npredictions. However, these estimates are of a Bayesian nature, whereas for\nsome important applications, like learning-based control with safety\nguarantees, frequentist uncertainty bounds are required. Although such rigorous\nbounds are available for Gaussian Processes, they are too conservative to be\nuseful in applications. This often leads practitioners to replacing these\nbounds by heuristics, thus breaking all theoretical guarantees. To address this\nproblem, we introduce new uncertainty bounds that are rigorous, yet practically\nuseful at the same time. In particular, the bounds can be explicitly evaluated\nand are much less conservative than state of the art results. Furthermore, we\nshow that certain model misspecifications lead to only graceful degradation. We\ndemonstrate these advantages and the usefulness of our results for\nlearning-based control with numerical examples.\n","authors":["Christian Fiedler","Carsten W. Scherer","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2105.02796v2.pdf","comment":"Contains supplementary material and corrections to the original\n version"},{"id":"http://arxiv.org/abs/2212.04780v3","updated":"2023-08-08T14:30:05Z","published":"2022-12-09T11:18:40Z","title":"Genie: Show Me the Data for Quantization","summary":" Zero-shot quantization is a promising approach for developing lightweight\ndeep neural networks when data is inaccessible owing to various reasons,\nincluding cost and issues related to privacy. 
By exploiting the learned\nparameters ($\\mu$ and $\\sigma$) of batch normalization layers in an\nFP32-pre-trained model, zero-shot quantization schemes focus on generating\nsynthetic data. Subsequently, they distill knowledge from the pre-trained model\n(teacher) to the quantized model (student) such that the quantized model can be\noptimized with the synthetic dataset. However, thus far, zero-shot quantization\nhas primarily been discussed in the context of quantization-aware training\nmethods, which require task-specific losses and long-term optimization as much\nas retraining. We thus introduce a post-training quantization scheme for\nzero-shot quantization that produces high-quality quantized networks within a\nfew hours. Furthermore, we propose a framework called Genie~that generates data\nsuited for quantization. With the data synthesized by Genie, we can produce\nrobust quantized models without real datasets, which is comparable to few-shot\nquantization. We also propose a post-training quantization algorithm to enhance\nthe performance of quantized models. By combining them, we can bridge the gap\nbetween zero-shot and few-shot quantization while significantly improving the\nquantization performance compared to that of existing approaches. In other\nwords, we can obtain a unique state-of-the-art zero-shot quantization approach.\nThe code is available at \\url{https://github.com/SamsungLabs/Genie}.\n","authors":["Yongkweon Jeon","Chungman Lee","Ho-young Kim"],"pdf_url":"https://arxiv.org/pdf/2212.04780v3.pdf","comment":"Accepted by CVPR 2023, https://github.com/SamsungLabs/Genie"},{"id":"http://arxiv.org/abs/2308.04286v1","updated":"2023-08-08T14:29:35Z","published":"2023-08-08T14:29:35Z","title":"Comparative Analysis of the wav2vec 2.0 Feature Extractor","summary":" Automatic speech recognition (ASR) systems typically use handcrafted feature\nextraction pipelines. To avoid their inherent information loss and to achieve\nmore consistent modeling from speech to transcribed text, neural raw waveform\nfeature extractors (FEs) are an appealing approach. Also the wav2vec 2.0 model,\nwhich has recently gained large popularity, uses a convolutional FE which\noperates directly on the speech waveform. However, it is not yet studied\nextensively in the literature. In this work, we study its capability to replace\nthe standard feature extraction methods in a connectionist temporal\nclassification (CTC) ASR model and compare it to an alternative neural FE. We\nshow that both are competitive with traditional FEs on the LibriSpeech\nbenchmark and analyze the effect of the individual components. Furthermore, we\nanalyze the learned filters and show that the most important information for\nthe ASR system is obtained by a set of bandpass filters.\n","authors":["Peter Vieting","Ralf Schlüter","Hermann Ney"],"pdf_url":"https://arxiv.org/pdf/2308.04286v1.pdf","comment":"Accepted at ITG 2023"},{"id":"http://arxiv.org/abs/2308.04275v1","updated":"2023-08-08T14:17:17Z","published":"2023-08-08T14:17:17Z","title":"In-Context Alignment: Chat with Vanilla Language Models Before\n Fine-Tuning","summary":" In this note, we explore inference-time alignment through in-context\nlearning. We consider a vanilla pretrained language model Llama-2 before any\nfine-tuning and retrieve an average of 9 demonstration alignment examples when\nthe model is prompted to follow chat-style instructions. 
Compared to direct\nprompting, the in-context alignment without changing model weights leads to a\n7x increase in win-rate w.r.t. the text-davinci-003 model from OpenAI, making\nthe vanilla language model comparable to strong baselines with alignment\nfine-tuning.\n","authors":["Xiaochuang Han"],"pdf_url":"https://arxiv.org/pdf/2308.04275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.08130v2","updated":"2023-08-08T14:11:40Z","published":"2021-01-19T16:14:02Z","title":"Machine learning for rapid discovery of laminar flow channel wall\n modifications that enhance heat transfer","summary":" Numerical simulation of fluids plays an essential role in modeling many\nphysical phenomena, which enables technological advancements, contributes to\nsustainable practices, and expands our understanding of various natural and\nengineered systems. The calculation of heat transfer in fluid flow in simple\nflat channels is a relatively easy task for various simulation methods.\nHowever, once the channel geometry becomes more complex, numerical simulations\nbecome a bottleneck in optimizing wall geometries. We present a combination of\naccurate numerical simulations of arbitrary, flat, and non-flat channels and\nmachine learning models predicting drag coefficient and Stanton number. We show\nthat convolutional neural networks (CNN) can accurately predict the target\nproperties at a fraction of the time of numerical simulations. We use the CNN\nmodels in a virtual high-throughput screening approach to explore a large\nnumber of possible, randomly generated wall architectures. Data Augmentation\nwas applied to existing geometries data to add generated new training data\nwhich have the same number of parameters of heat transfer to improve the\nmodel's generalization. The general approach is not only applicable to simple\nflow setups as presented here but can be extended to more complex tasks, such\nas multiphase or even reactive unit operations in chemical engineering.\n","authors":["Yuri Koide","Arjun J. Kaithakkal","Matthias Schniewind","Bradley P. Ladewig","Alexander Stroh","Pascal Friederich"],"pdf_url":"https://arxiv.org/pdf/2101.08130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04268v1","updated":"2023-08-08T14:09:33Z","published":"2023-08-08T14:09:33Z","title":"Teacher-Student Architecture for Knowledge Distillation: A Survey","summary":" Although Deep neural networks (DNNs) have shown a strong capacity to solve\nlarge-scale problems in many areas, such DNNs are hard to be deployed in\nreal-world systems due to their voluminous parameters. To tackle this issue,\nTeacher-Student architectures were proposed, where simple student networks with\na few parameters can achieve comparable performance to deep teacher networks\nwith many parameters. Recently, Teacher-Student architectures have been\neffectively and widely embraced on various knowledge distillation (KD)\nobjectives, including knowledge compression, knowledge expansion, knowledge\nadaptation, and knowledge enhancement. With the help of Teacher-Student\narchitectures, current studies are able to achieve multiple distillation\nobjectives through lightweight and generalized student networks. Different from\nexisting KD surveys that primarily focus on knowledge compression, this survey\nfirst explores Teacher-Student architectures across multiple distillation\nobjectives. This survey presents an introduction to various knowledge\nrepresentations and their corresponding optimization objectives. 
Additionally,\nwe provide a systematic overview of Teacher-Student architectures with\nrepresentative learning algorithms and effective distillation schemes. This\nsurvey also summarizes recent applications of Teacher-Student architectures\nacross multiple purposes, including classification, recognition, generation,\nranking, and regression. Lastly, potential research directions in KD are\ninvestigated, focusing on architecture design, knowledge quality, and\ntheoretical studies of regression-based learning, respectively. Through this\ncomprehensive survey, industry practitioners and the academic community can\ngain valuable insights and guidelines for effectively designing, learning, and\napplying Teacher-Student architectures on various distillation objectives.\n","authors":["Chengming Hu","Xuan Li","Dan Liu","Haolun Wu","Xi Chen","Ju Wang","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04268v1.pdf","comment":"20 pages. arXiv admin note: substantial text overlap with\n arXiv:2210.17332"},{"id":"http://arxiv.org/abs/2308.04263v1","updated":"2023-08-08T13:59:56Z","published":"2023-08-08T13:59:56Z","title":"BarlowRL: Barlow Twins for Data-Efficient Reinforcement Learning","summary":" This paper introduces BarlowRL, a data-efficient reinforcement learning agent\nthat combines the Barlow Twins self-supervised learning framework with DER\n(Data-Efficient Rainbow) algorithm. BarlowRL outperforms both DER and its\ncontrastive counterpart CURL on the Atari 100k benchmark. BarlowRL avoids\ndimensional collapse by enforcing information spread to the whole space. This\nhelps RL algorithms to utilize uniformly spread state representation that\neventually results in a remarkable performance. The integration of Barlow Twins\nwith DER enhances data efficiency and achieves superior performance in the RL\ntasks. BarlowRL demonstrates the potential of incorporating self-supervised\nlearning techniques to improve RL algorithms.\n","authors":["Omer Veysel Cagatan"],"pdf_url":"https://arxiv.org/pdf/2308.04263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04262v1","updated":"2023-08-08T13:59:16Z","published":"2023-08-08T13:59:16Z","title":"SDLFormer: A Sparse and Dense Locality-enhanced Transformer for\n Accelerated MR Image Reconstruction","summary":" Transformers have emerged as viable alternatives to convolutional neural\nnetworks owing to their ability to learn non-local region relationships in the\nspatial domain. The self-attention mechanism of the transformer enables\ntransformers to capture long-range dependencies in the images, which might be\ndesirable for accelerated MRI image reconstruction as the effect of\nundersampling is non-local in the image domain. Despite its computational\nefficiency, the window-based transformers suffer from restricted receptive\nfields as the dependencies are limited to within the scope of the image\nwindows. We propose a window-based transformer network that integrates dilated\nattention mechanism and convolution for accelerated MRI image reconstruction.\nThe proposed network consists of dilated and dense neighborhood attention\ntransformers to enhance the distant neighborhood pixel relationship and\nintroduce depth-wise convolutions within the transformer module to learn\nlow-level translation invariant features for accelerated MRI image\nreconstruction. The proposed model is trained in a self-supervised manner. 
We\nperform extensive experiments for multi-coil MRI acceleration for coronal PD,\ncoronal PDFS and axial T2 contrasts with 4x and 5x under-sampling in\nself-supervised learning based on k-space splitting. We compare our method\nagainst other reconstruction architectures and the parallel domain\nself-supervised learning baseline. Results show that the proposed model\nexhibits improvement margins of (i) around 1.40 dB in PSNR and around 0.028 in\nSSIM on average over other architectures (ii) around 1.44 dB in PSNR and around\n0.029 in SSIM over parallel domain self-supervised learning. The code is\navailable at https://github.com/rahul-gs-16/sdlformer.git\n","authors":["Rahul G. S.","Sriprabha Ramnarayanan","Mohammad Al Fahim","Keerthi Ram","Preejith S. P","Mohanasankar Sivaprakasam"],"pdf_url":"https://arxiv.org/pdf/2308.04262v1.pdf","comment":"Accepted at MICCAI workshop MILLanD 2023 Medical Image Learning with\n noisy and Limited Data"},{"id":"http://arxiv.org/abs/2308.04258v1","updated":"2023-08-08T13:46:55Z","published":"2023-08-08T13:46:55Z","title":"Advancing Natural-Language Based Audio Retrieval with PaSST and Large\n Audio-Caption Data Sets","summary":" This work presents a text-to-audio-retrieval system based on pre-trained text\nand spectrogram transformers. Our method projects recordings and textual\ndescriptions into a shared audio-caption space in which related examples from\ndifferent modalities are close. Through a systematic analysis, we examine how\neach component of the system influences retrieval performance. As a result, we\nidentify two key components that play a crucial role in driving performance:\nthe self-attention-based audio encoder for audio embedding and the utilization\nof additional human-generated and synthetic data sets during pre-training. We\nfurther experimented with augmenting ClothoV2 captions with available keywords\nto increase their variety; however, this only led to marginal improvements. Our\nsystem ranked first in the 2023's DCASE Challenge, and it outperforms the\ncurrent state of the art on the ClothoV2 benchmark by 5.6 pp. mAP@10.\n","authors":["Paul Primus","Khaled Koutini","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2308.04258v1.pdf","comment":"submitted to DCASE Workshop 2023"},{"id":"http://arxiv.org/abs/2307.11661v2","updated":"2023-08-08T13:44:12Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. 
We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. The code, prompts, and auxiliary text dataset is\navailable at https://github.com/mayug/VDT-Adapter.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v2.pdf","comment":"Paper accepted at ICCV-W 2023. V2 contains additional comparisons\n with concurrent works"},{"id":"http://arxiv.org/abs/2308.04237v1","updated":"2023-08-08T13:03:36Z","published":"2023-08-08T13:03:36Z","title":"Federated Inference with Reliable Uncertainty Quantification over\n Wireless Channels via Conformal Prediction","summary":" Consider a setting in which devices and a server share a pre-trained model.\nThe server wishes to make an inference on a new input given the model. Devices\nhave access to data, previously not used for training, and can communicate to\nthe server over a common wireless channel. If the devices have no access to the\nnew input, can communication from devices to the server enhance the quality of\nthe inference decision at the server? Recent work has introduced federated\nconformal prediction (CP), which leverages devices-to-server communication to\nimprove the reliability of the server's decision. With federated CP, devices\ncommunicate to the server information about the loss accrued by the shared\npre-trained model on the local data, and the server leverages this information\nto calibrate a decision interval, or set, so that it is guaranteed to contain\nthe correct answer with a pre-defined target reliability level. Previous work\nassumed noise-free communication, whereby devices can communicate a single real\nnumber to the server. In this paper, we study for the first time federated CP\nin a wireless setting. We introduce a novel protocol, termed wireless federated\nconformal prediction (WFCP), which builds on type-based multiple access (TBMA)\nand on a novel quantile correction strategy. WFCP is proved to provide formal\nreliability guarantees in terms of coverage of the predicted set produced by\nthe server. Using numerical results, we demonstrate the significant advantages\nof WFCP against digital implementations of existing federated CP schemes,\nespecially in regimes with limited communication resources and/or large number\nof devices.\n","authors":["Meiyi Zhu","Matteo Zecchin","Sangwoo Park","Caili Guo","Chunyan Feng","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2308.04237v1.pdf","comment":"33 pages, 6 figures"},{"id":"http://arxiv.org/abs/2304.08134v3","updated":"2023-08-08T12:57:36Z","published":"2023-04-17T10:29:26Z","title":"Tackling Face Verification Edge Cases: In-Depth Analysis and\n Human-Machine Fusion Approach","summary":" Nowadays, face recognition systems surpass human performance on several\ndatasets. However, there are still edge cases that the machine can't correctly\nclassify. This paper investigates the effect of a combination of machine and\nhuman operators in the face verification task. 
First, we look closer at the\nedge cases for several state-of-the-art models to discover common datasets'\nchallenging settings. Then, we conduct a study with 60 participants on these\nselected tasks with humans and provide an extensive analysis. Finally, we\ndemonstrate that combining machine and human decisions can further improve the\nperformance of state-of-the-art face verification systems on various benchmark\ndatasets. Code and data are publicly available on GitHub.\n","authors":["Martin Knoche","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2304.08134v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14353v3","updated":"2023-08-08T12:53:23Z","published":"2023-02-28T07:11:55Z","title":"A semantic backdoor attack against Graph Convolutional Networks","summary":" Graph convolutional networks (GCNs) have been very effective in addressing\nthe issue of various graph-structured related tasks, such as node\nclassification and graph classification. However, recent research has shown\nthat GCNs are vulnerable to a new type of threat called a backdoor attack,\nwhere the adversary can inject a hidden backdoor into GCNs so that the attacked\nmodel performs well on benign samples, but its prediction will be maliciously\nchanged to the attacker-specified target label if the hidden backdoor is\nactivated by the attacker-defined trigger. In this paper, we investigate\nwhether such semantic backdoor attacks are possible for GCNs and propose a\nsemantic backdoor attack against GCNs (SBAG) under the context of graph\nclassification to reveal the existence of this security vulnerability in GCNs.\nSBAG uses a certain type of node in the samples as a backdoor trigger and\ninjects a hidden backdoor into GCN models by poisoning training data. The\nbackdoor will be activated, and the GCN models will give malicious\nclassification results specified by the attacker even on unmodified samples as\nlong as the samples contain enough trigger nodes. We evaluate SBAG on four\ngraph datasets. The experimental results indicate that SBAG can achieve attack\nsuccess rates of approximately 99.9% and over 82% for two kinds of attack\nsamples, respectively, with poisoning rates of less than 5%.\n","authors":["Jiazhu Dai","Zhipeng Xiong"],"pdf_url":"https://arxiv.org/pdf/2302.14353v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04226v1","updated":"2023-08-08T12:45:01Z","published":"2023-08-08T12:45:01Z","title":"OpinionConv: Conversational Product Search with Grounded Opinions","summary":" When searching for products, the opinions of others play an important role in\nmaking informed decisions. Subjective experiences about a product can be a\nvaluable source of information. This is also true in sales conversations, where\na customer and a sales assistant exchange facts and opinions about products.\nHowever, training an AI for such conversations is complicated by the fact that\nlanguage models do not possess authentic opinions for their lack of real-world\nexperience. We address this problem by leveraging product reviews as a rich\nsource of product opinions to ground conversational AI in true subjective\nnarratives. With OpinionConv, we develop the first conversational AI for\nsimulating sales conversations. To validate the generated conversations, we\nconduct several user studies showing that the generated opinions are perceived\nas realistic. 
Our assessors also confirm the importance of opinions as an\ninformative basis for decision-making.\n","authors":["Vahid Sadiri Javadi","Martin Potthast","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.04226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04220v1","updated":"2023-08-08T12:34:32Z","published":"2023-08-08T12:34:32Z","title":"Semantic Interpretation and Validation of Graph Attention-based\n Explanations for GNN Models","summary":" In this work, we propose a methodology for investigating the application of\nsemantic attention to enhance the explainability of Graph Neural Network\n(GNN)-based models, introducing semantically-informed perturbations and\nestablishing a correlation between predicted feature-importance weights and\nmodel accuracy. Graph Deep Learning (GDL) has emerged as a promising field for\ntasks like scene interpretation, leveraging flexible graph structures to\nconcisely describe complex features and relationships. As traditional\nexplainability methods used in eXplainable AI (XAI) cannot be directly applied\nto such structures, graph-specific approaches are introduced. Attention\nmechanisms have demonstrated their efficacy in estimating the importance of\ninput features in deep learning models and thus have been previously employed\nto provide feature-based explanations for GNN predictions. Building upon these\ninsights, we extend existing attention-based graph-explainability methods\ninvestigating the use of attention weights as importance indicators of\nsemantically sorted feature sets. Through analysing the behaviour of predicted\nattention-weights distribution in correlation with model accuracy, we gain\nvaluable insights into feature importance with respect to the behaviour of the\nGNN model. We apply our methodology to a lidar pointcloud estimation model\nsuccessfully identifying key semantic classes that contribute to enhanced\nperformance effectively generating reliable post-hoc semantic explanations.\n","authors":["Efimia Panagiotaki","Daniele De Martini","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2308.04220v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2211.07909v2","updated":"2023-08-08T12:30:03Z","published":"2022-11-15T05:29:58Z","title":"Selective Memory Recursive Least Squares: Recast Forgetting into Memory\n in RBF Neural Network Based Real-Time Learning","summary":" In radial basis function neural network (RBFNN) based real-time learning\ntasks, forgetting mechanisms are widely used such that the neural network can\nkeep its sensitivity to new data. However, with forgetting mechanisms, some\nuseful knowledge will get lost simply because they are learned a long time ago,\nwhich we refer to as the passive knowledge forgetting phenomenon. To address\nthis problem, this paper proposes a real-time training method named selective\nmemory recursive least squares (SMRLS) in which the classical forgetting\nmechanisms are recast into a memory mechanism. Different from the forgetting\nmechanism, which mainly evaluates the importance of samples according to the\ntime when samples are collected, the memory mechanism evaluates the importance\nof samples through both temporal and spatial distribution of samples. With\nSMRLS, the input space of the RBFNN is evenly divided into a finite number of\npartitions and a synthesized objective function is developed using synthesized\nsamples from each partition. 
In addition to the current approximation error,\nthe neural network also updates its weights according to the recorded data from\nthe partition being visited. Compared with classical training methods including\nthe forgetting factor recursive least squares (FFRLS) and stochastic gradient\ndescent (SGD) methods, SMRLS achieves improved learning speed and\ngeneralization capability, which are demonstrated by corresponding simulation\nresults.\n","authors":["Yiming Fei","Jiangang Li","Yanan Li"],"pdf_url":"https://arxiv.org/pdf/2211.07909v2.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2308.04212v1","updated":"2023-08-08T12:22:09Z","published":"2023-08-08T12:22:09Z","title":"Varying-coefficients for regional quantile via KNN-based LASSO with\n applications to health outcome study","summary":" Health outcomes, such as body mass index and cholesterol levels, are known to\nbe dependent on age and exhibit varying effects with their associated risk\nfactors. In this paper, we propose a novel framework for dynamic modeling of\nthe associations between health outcomes and risk factors using\nvarying-coefficients (VC) regional quantile regression via K-nearest neighbors\n(KNN) fused Lasso, which captures the time-varying effects of age. The proposed\nmethod has strong theoretical properties, including a tight estimation error\nbound and the ability to detect exact clustered patterns under certain\nregularity conditions. To efficiently solve the resulting optimization problem,\nwe develop an alternating direction method of multipliers (ADMM) algorithm. Our\nempirical results demonstrate the efficacy of the proposed method in capturing\nthe complex age-dependent associations between health outcomes and their risk\nfactors.\n","authors":["Seyoung Park","Eun Ryung Lee","Hyokyoung G. Hong"],"pdf_url":"https://arxiv.org/pdf/2308.04212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2006.06926v4","updated":"2023-08-08T11:45:08Z","published":"2020-06-12T03:19:48Z","title":"Learning Bayesian Networks with Annealing Machine","summary":" Recent studies have reported that annealing machines are capable of solving\ncombinatorial optimization problems with high accuracy. Annealing machines can\npotentially be applied to score-based Bayesian network structure learning.\nHowever, the bit capacity of an annealing machine is currently limited. To\nutilize the annealing technology, converting score-based learning problems into\nquadratic unconstrained binary optimizations within the bit capacity is\nnecessary. In this paper, we propose an efficient conversion method with the\nadvanced identification of candidate parent sets and their decomposition. We\nalso provide an integer programming problem to find the decomposition that\nminimizes the number of required bits. Experimental results on $7$ benchmark\ndatasets with variables from $75$ to $223$ show that our approach requires less\nbits than the $100$K bit capacity of the fourth-generation Fujitsu Digital\nAnnealer, a fully coupled annealing machine developed with semiconductor\ntechnology. Moreover, we demonstrate that the Digital Annealer with our\nconversion method outperforms existing algorithms on score maximization. 
These\nresults highlight the utility of annealing processors in learning Bayesian\nnetworks.\n","authors":["Yuta Shikuri"],"pdf_url":"https://arxiv.org/pdf/2006.06926v4.pdf","comment":"13 pages, 5 tables, 3 figures, NeurIPS 2023 (under review)"},{"id":"http://arxiv.org/abs/2303.00286v3","updated":"2023-08-08T11:34:24Z","published":"2023-03-01T07:25:28Z","title":"Treat Different Negatives Differently: Enriching Loss Functions with\n Domain and Range Constraints for Link Prediction","summary":" Knowledge graph embedding models (KGEMs) are used for various tasks related\nto knowledge graphs (KGs), including link prediction. They are trained with\nloss functions that are computed considering a batch of scored triples and\ntheir corresponding labels. Traditional approaches consider the label of a\ntriple to be either true or false. However, recent works suggest that all\nnegative triples should not be valued equally. In line with this recent\nassumption, we posit that negative triples that are semantically valid w.r.t.\ndomain and range constraints might be high-quality negative triples. As such,\nloss functions should treat them differently from semantically invalid negative\nones. To this aim, we propose semantic-driven versions for the three main loss\nfunctions for link prediction. In an extensive and controlled experimental\nsetting, we show that the proposed loss functions systematically provide\nsatisfying results on three public benchmark KGs underpinned with different\nschemas, which demonstrates both the generality and superiority of our proposed\napproach. In fact, the proposed loss functions do (1) lead to better MRR and\nHits@10 values, (2) drive KGEMs towards better semantic awareness as measured\nby the Sem@K metric. This highlights that semantic information globally\nimproves KGEMs, and thus should be incorporated into loss functions. Domains\nand ranges of relations being largely available in schema-defined KGs, this\nmakes our approach both beneficial and widely usable in practice.\n","authors":["Nicolas Hubert","Pierre Monnin","Armelle Brun","Davy Monticolo"],"pdf_url":"https://arxiv.org/pdf/2303.00286v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04185v1","updated":"2023-08-08T11:10:42Z","published":"2023-08-08T11:10:42Z","title":"Iterative Sketching for Secure Coded Regression","summary":" In this work, we propose methods for speeding up linear regression\ndistributively, while ensuring security. We leverage randomized sketching\ntechniques, and improve straggler resilience in asynchronous systems.\nSpecifically, we apply a random orthonormal matrix and then subsample\n\\textit{blocks}, to simultaneously secure the information and reduce the\ndimension of the regression problem. In our setup, the transformation\ncorresponds to an encoded encryption in an \\textit{approximate gradient coding\nscheme}, and the subsampling corresponds to the responses of the non-straggling\nworkers; in a centralized coded computing network. This results in a\ndistributive \\textit{iterative sketching} approach for an $\\ell_2$-subspace\nembedding, \\textit{i.e.} a new sketch is considered at each iteration. We also\nfocus on the special case of the \\textit{Subsampled Randomized Hadamard\nTransform}, which we generalize to block sampling; and discuss how it can be\nmodified in order to secure the data.\n","authors":["Neophytos Charalambides","Hessam Mahdavifar","Mert Pilanci","Alfred O. Hero III"],"pdf_url":"https://arxiv.org/pdf/2308.04185v1.pdf","comment":"28 pages, 7 figures. 
arXiv admin note: substantial text overlap with\n arXiv:2201.08522"},{"id":"http://arxiv.org/abs/2111.10275v3","updated":"2023-08-08T11:05:04Z","published":"2021-11-19T15:25:06Z","title":"Composite Goodness-of-fit Tests with Kernels","summary":" Model misspecification can create significant challenges for the\nimplementation of probabilistic models, and this has led to development of a\nrange of robust methods which directly account for this issue. However, whether\nthese more involved methods are required will depend on whether the model is\nreally misspecified, and there is a lack of generally applicable methods to\nanswer this question. In this paper, we propose one such method. More\nprecisely, we propose kernel-based hypothesis tests for the challenging\ncomposite testing problem, where we are interested in whether the data comes\nfrom any distribution in some parametric family. Our tests make use of minimum\ndistance estimators based on the maximum mean discrepancy and the kernel Stein\ndiscrepancy. They are widely applicable, including whenever the density of the\nparametric model is known up to normalisation constant, or if the model takes\nthe form of a simulator. As our main result, we show that we are able to\nestimate the parameter and conduct our test on the same data (without data\nsplitting), while maintaining a correct test level. Our approach is illustrated\non a range of problems, including testing for goodness-of-fit of an\nunnormalised non-parametric density model, and an intractable generative model\nof a biological cellular network.\n","authors":["Oscar Key","Arthur Gretton","François-Xavier Briol","Tamara Fernandez"],"pdf_url":"https://arxiv.org/pdf/2111.10275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04180v1","updated":"2023-08-08T10:42:33Z","published":"2023-08-08T10:42:33Z","title":"Studying Socially Unacceptable Discourse Classification (SUD) through\n different eyes: \"Are we on the same page ?\"","summary":" We study Socially Unacceptable Discourse (SUD) characterization and detection\nin online text. We first build and present a novel corpus that contains a large\nvariety of manually annotated texts from different online sources used so far\nin state-of-the-art Machine learning (ML) SUD detection solutions. This global\ncontext allows us to test the generalization ability of SUD classifiers that\nacquire knowledge around the same SUD categories, but from different contexts.\nFrom this perspective, we can analyze how (possibly) different annotation\nmodalities influence SUD learning by discussing open challenges and open\nresearch directions. We also provide several data insights which can support\ndomain experts in the annotation task.\n","authors":["Bruno Machado Carneiro","Michele Linardi","Julien Longhi"],"pdf_url":"https://arxiv.org/pdf/2308.04180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14915v2","updated":"2023-08-08T10:30:54Z","published":"2022-09-29T16:22:46Z","title":"Spiking Neural Networks for event-based action recognition: A new task\n to understand their advantage","summary":" Spiking Neural Networks (SNN) are characterised by their unique temporal\ndynamics, but the properties and advantages of such computations are still not\nwell understood. 
In order to provide answers, in this work we demonstrate how\nSpiking neurons can enable temporal feature extraction in feed-forward neural\nnetworks without the need for recurrent synapses, showing how their\nbio-inspired computing principles can be successfully exploited beyond energy\nefficiency gains and evidencing their differences with respect to conventional\nneurons. This is demonstrated by proposing a new task, DVS-Gesture-Chain\n(DVS-GC), which allows, for the first time, to evaluate the perception of\ntemporal dependencies in a real event-based action recognition dataset. Our\nstudy proves how the widely used DVS Gesture benchmark could be solved by\nnetworks without temporal feature extraction, unlike the new DVS-GC which\ndemands an understanding of the ordering of the events. Furthermore, this setup\nallowed us to unveil the role of the leakage rate in spiking neurons for\ntemporal processing tasks and demonstrated the benefits of \"hard reset\"\nmechanisms. Additionally, we also show how time-dependent weights and\nnormalization can lead to understanding order by means of temporal attention.\n","authors":["Alex Vicente-Sola","Davide L. Manna","Paul Kirkland","Gaetano Di Caterina","Trevor Bihl"],"pdf_url":"https://arxiv.org/pdf/2209.14915v2.pdf","comment":"New article superseding the one in previous versions"},{"id":"http://arxiv.org/abs/2301.10227v2","updated":"2023-08-08T10:18:04Z","published":"2023-01-02T14:17:08Z","title":"Denoising Diffusion Probabilistic Models for Generation of Realistic\n Fully-Annotated Microscopy Image Data Sets","summary":" Recent advances in computer vision have led to significant progress in the\ngeneration of realistic image data, with denoising diffusion probabilistic\nmodels proving to be a particularly effective method. In this study, we\ndemonstrate that diffusion models can effectively generate fully-annotated\nmicroscopy image data sets through an unsupervised and intuitive approach,\nusing rough sketches of desired structures as the starting point. The proposed\npipeline helps to reduce the reliance on manual annotations when training deep\nlearning-based segmentation approaches and enables the segmentation of diverse\ndatasets without the need for human annotations. This approach holds great\npromise in streamlining the data generation process and enabling a more\nefficient and scalable training of segmentation models, as we show in the\nexample of different practical experiments involving various organisms and cell\ntypes.\n","authors":["Dennis Eschweiler","Rüveyda Yilmaz","Matisse Baumann","Ina Laube","Rijo Roy","Abin Jose","Daniel Brückner","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2301.10227v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2301.05609v4","updated":"2023-08-08T10:04:14Z","published":"2023-01-13T15:24:40Z","title":"Co-manipulation of soft-materials estimating deformation from depth\n images","summary":" Human-robot co-manipulation of soft materials, such as fabrics, composites,\nand sheets of paper/cardboard, is a challenging operation that presents several\nrelevant industrial applications. Estimating the deformation state of the\nco-manipulated material is one of the main challenges. Viable methods provide\nthe indirect measure by calculating the human-robot relative distance. 
In this\npaper, we develop a data-driven model to estimate the deformation state of the\nmaterial from a depth image through a Convolutional Neural Network (CNN).\nFirst, we define the deformation state of the material as the relative\nroto-translation between the current robot pose and a human grasping position. The\nmodel estimates the current deformation state through a Convolutional Neural\nNetwork, specifically a DenseNet-121 pretrained on ImageNet. The delta between\nthe current and the desired deformation state is fed to the robot controller\nthat outputs twist commands. The paper describes the developed approach to\nacquire and preprocess the dataset and to train the model. The model is compared with\nthe current state-of-the-art method based on a skeletal tracker from cameras.\nResults show that our approach achieves better performance and avoids the\nvarious drawbacks caused by using a skeletal tracker. Finally, we also studied\nthe model performance according to different architectures and dataset\ndimensions to minimize the time required for dataset acquisition.\n","authors":["Giorgio Nicola","Enrico Villagrossi","Nicola Pedrocchi"],"pdf_url":"https://arxiv.org/pdf/2301.05609v4.pdf","comment":"Pre-print, Accepted to Robotics and Computer Integrated Manufacturing"},{"id":"http://arxiv.org/abs/2308.04169v1","updated":"2023-08-08T09:59:56Z","published":"2023-08-08T09:59:56Z","title":"Dual input neural networks for positional sound source localization","summary":" In many signal processing applications, metadata may be advantageously used\nin conjunction with a high dimensional signal to produce a desired output. In\nthe case of classical Sound Source Localization (SSL) algorithms, information\nfrom high-dimensional, multichannel audio signals received by many\ndistributed microphones is combined with information describing acoustic\nproperties of the scene, such as the microphones' coordinates in space, to\nestimate the position of a sound source. We introduce Dual Input Neural\nNetworks (DI-NNs) as a simple and effective way to model these two data types\nin a neural network. We train and evaluate our proposed DI-NN on scenarios of\nvarying difficulty and realism and compare it against an alternative\narchitecture, a classical Least-Squares (LS) method as well as a classical\nConvolutional Recurrent Neural Network (CRNN). Our results show that the DI-NN\nsignificantly outperforms the baselines, achieving a five times lower\nlocalization error than the LS method and two times lower than the CRNN in a\ntest dataset of real recordings.\n","authors":["Eric Grinstein","Vincent W. Neo","Patrick A. Naylor"],"pdf_url":"https://arxiv.org/pdf/2308.04169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02632v2","updated":"2023-08-08T09:21:40Z","published":"2023-08-04T17:44:27Z","title":"Generation of Realistic Synthetic Raw Radar Data for Automated Driving\n Applications using Generative Adversarial Networks","summary":" The main approaches for simulating FMCW radar are based on ray tracing, which\nis usually computationally intensive and does not account for background noise.\nThis work proposes a faster method for FMCW radar simulation capable of\ngenerating synthetic raw radar data using generative adversarial networks\n(GAN). The code and pre-trained weights are open-source and available on\nGitHub. This method generates 16 simultaneous chirps, which allows the\ngenerated data to be used for the further development of algorithms for\nprocessing radar data (filtering and clustering). 
This can increase the\npotential for data augmentation, e.g., by generating data in non-existent or\nsafety-critical scenarios that are not reproducible in real life. In this work,\nthe GAN was trained with radar measurements of a motorcycle and used to\ngenerate synthetic raw radar data of a motorcycle traveling in a straight line.\nFor generating this data, the distance of the motorcycle and Gaussian noise are\nused as input to the neural network. The synthetically generated radar chirps were\nevaluated using the Frechet Inception Distance (FID). Then, the Range-Azimuth\n(RA) map is calculated twice: first, based on synthetic data using this GAN\nand, second, based on real data. Based on these RA maps, an algorithm with\nadaptive threshold and edge detection is used for object detection. The results\nhave shown that the data is realistic in terms of coherent radar reflections of\nthe motorcycle and background noise based on the comparison of chirps, the RA\nmaps and the object detection results. Thus, the proposed method in this work\nhas been shown to minimize the simulation-to-reality gap for the generation of radar\ndata.\n","authors":["Eduardo C. Fidelis","Fabio Reway","Herick Y. S. Ribeiro","Pietro L. Campos","Werner Huber","Christian Icking","Lester A. Faria","Torsten Schön"],"pdf_url":"https://arxiv.org/pdf/2308.02632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08325v2","updated":"2023-08-08T09:08:01Z","published":"2023-06-14T07:54:53Z","title":"GCformer: An Efficient Framework for Accurate and Scalable Long-Term\n Multivariate Time Series Forecasting","summary":" Transformer-based models have emerged as promising tools for time series\nforecasting.\n However, these models cannot make accurate predictions for long input time\nseries. On the one hand, they fail to capture global dependencies within time\nseries data. On the other hand, the long input sequence usually leads to a large\nmodel size and high time complexity.\n To address these limitations, we present GCformer, which combines a\nstructured global convolutional branch for processing long input sequences with\na local Transformer-based branch for capturing short, recent signals. A\ncohesive framework for a global convolution kernel has been introduced,\nutilizing three distinct parameterization methods. The selected structured\nconvolutional kernel in the global branch has been specifically crafted with\nsublinear complexity, thereby allowing for the efficient and effective\nprocessing of lengthy and noisy input signals. Empirical studies on six\nbenchmark datasets demonstrate that GCformer outperforms state-of-the-art\nmethods, reducing MSE in multivariate time series benchmarks by 4.38% and\nmodel parameters by 61.92%. In particular, the global convolutional branch can\nserve as a plug-in block to enhance the performance of other models, with an\naverage improvement of 31.93\\%, including various recently published\nTransformer-based models. Our code is publicly available at\nhttps://github.com/zyj-111/GCformer.\n","authors":["YanJun Zhao","Ziqing Ma","Tian Zhou","Liang Sun","Mengni Ye","Yi Qian"],"pdf_url":"https://arxiv.org/pdf/2306.08325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02582v2","updated":"2023-08-08T08:57:20Z","published":"2023-08-01T05:31:36Z","title":"Adapt and Decompose: Efficient Generalization of Text-to-SQL via Domain\n Adapted Least-To-Most Prompting","summary":" Cross-domain and cross-compositional generalization of Text-to-SQL semantic\nparsing is a challenging task. 
Existing Large Language Model (LLM)-based\nsolutions rely on inference-time retrieval of few-shot exemplars from the\ntraining set to synthesize a run-time prompt for each Natural Language (NL)\ntest query. In contrast, we devise an algorithm which performs offline sampling\nof a minimal set of few-shot exemplars from the training data, with complete coverage of\nSQL clauses, operators and functions, and maximal domain coverage within the\nallowed token length. This allows for synthesis of a fixed Generic Prompt (GP),\nwith a diverse set of exemplars common across NL test queries, avoiding\nexpensive test-time exemplar retrieval. We further auto-adapt the GP to the\ntarget database domain (DA-GP), to better handle cross-domain generalization;\nfollowed by a decomposed Least-To-Most-Prompting (LTMP-DA-GP) to handle\ncross-compositional generalization. The synthesis of LTMP-DA-GP is an offline\ntask, to be performed once per new database with minimal human\nintervention. Our approach demonstrates superior performance on the KaggleDBQA\ndataset, designed to evaluate generalizability for the Text-to-SQL task. We\nfurther showcase consistent performance improvement of LTMP-DA-GP over GP,\nacross LLMs and databases of KaggleDBQA, highlighting the efficacy and\nmodel-agnostic benefits of our prompt-based adapt-and-decompose approach.\n","authors":["Aseem Arora","Shabbirhussain Bhaisaheb","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2308.02582v2.pdf","comment":"22 Pages"},{"id":"http://arxiv.org/abs/2206.01186v2","updated":"2023-08-08T08:51:45Z","published":"2022-06-01T10:28:18Z","title":"ORC: Network Group-based Knowledge Distillation using Online Role Change","summary":" In knowledge distillation, since a single, omnipotent teacher network cannot\nsolve all problems, multiple teacher-based knowledge distillations have been\nstudied recently. However, sometimes their improvements are not as good as\nexpected because some immature teachers may transfer false knowledge to the\nstudent. In this paper, to overcome this limitation and exploit the efficacy of\nmultiple networks, we divide the networks into teacher and student\ngroups. That is, the student group is a set of immature networks\nthat require learning the teacher's knowledge, while the teacher group consists\nof the selected networks that are capable of teaching successfully. We propose\nan online role change strategy where the top-ranked networks in the student\ngroup can be promoted to the teacher group at every iteration. After\ntraining the teacher group using the error samples of the student group to\nrefine the teacher group's knowledge, we transfer the collaborative knowledge\nfrom the teacher group to the student group successfully. We verify the\nsuperiority of the proposed method on CIFAR-10, CIFAR-100, and ImageNet, where it\nachieves high performance. 
We further show the generality of our method with\nvarious backbone architectures such as ResNet, WRN, VGG, Mobilenet, and\nShufflenet.\n","authors":["Junyong Choi","Hyeon Cho","Seokhwa Cheung","Wonjun Hwang"],"pdf_url":"https://arxiv.org/pdf/2206.01186v2.pdf","comment":"Accepted at ICCV 2023; Supplementary material would be found at CVF\n Open Access"},{"id":"http://arxiv.org/abs/2308.04137v1","updated":"2023-08-08T08:50:27Z","published":"2023-08-08T08:50:27Z","title":"Comprehensive Assessment of the Performance of Deep Learning Classifiers\n Reveals a Surprising Lack of Robustness","summary":" Reliable and robust evaluation methods are a necessary first step towards\ndeveloping machine learning models that are themselves robust and reliable.\nUnfortunately, current evaluation protocols typically used to assess\nclassifiers fail to comprehensively evaluate performance as they tend to rely\non limited types of test data, and ignore others. For example, using the\nstandard test data fails to evaluate the predictions made by the classifier to\nsamples from classes it was not trained on. On the other hand, testing with\ndata containing samples from unknown classes fails to evaluate how well the\nclassifier can predict the labels for known classes. This article advocates\nbench-marking performance using a wide range of different types of data and\nusing a single metric that can be applied to all such data types to produce a\nconsistent evaluation of performance. Using such a benchmark it is found that\ncurrent deep neural networks, including those trained with methods that are\nbelieved to produce state-of-the-art robustness, are extremely vulnerable to\nmaking mistakes on certain types of data. This means that such models will be\nunreliable in real-world scenarios where they may encounter data from many\ndifferent domains, and that they are insecure as they can easily be fooled into\nmaking the wrong decisions. It is hoped that these results will motivate the\nwider adoption of more comprehensive testing methods that will, in turn, lead\nto the development of more robust machine learning methods in the future.\n Code is available at:\n\\url{https://codeberg.org/mwspratling/RobustnessEvaluation}\n","authors":["Michael W. Spratling"],"pdf_url":"https://arxiv.org/pdf/2308.04137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18651v3","updated":"2023-08-08T08:48:48Z","published":"2023-05-29T23:06:05Z","title":"UMD: Unsupervised Model Detection for X2X Backdoor Attacks","summary":" Backdoor (Trojan) attack is a common threat to deep neural networks, where\nsamples from one or more source classes embedded with a backdoor trigger will\nbe misclassified to adversarial target classes. Existing methods for detecting\nwhether a classifier is backdoor attacked are mostly designed for attacks with\na single adversarial target (e.g., all-to-one attack). To the best of our\nknowledge, without supervision, no existing methods can effectively address the\nmore general X2X attack with an arbitrary number of source classes, each paired\nwith an arbitrary target class. In this paper, we propose UMD, the first\nUnsupervised Model Detection method that effectively detects X2X backdoor\nattacks via a joint inference of the adversarial (source, target) class pairs.\nIn particular, we first define a novel transferability statistic to measure and\nselect a subset of putative backdoor class pairs based on a proposed clustering\napproach. 
Then, these selected class pairs are jointly assessed based on an\naggregation of their reverse-engineered trigger size for detection inference,\nusing a robust and unsupervised anomaly detector we proposed. We conduct\ncomprehensive evaluations on CIFAR-10, GTSRB, and Imagenette dataset, and show\nthat our unsupervised UMD outperforms SOTA detectors (even with supervision) by\n17%, 4%, and 8%, respectively, in terms of the detection accuracy against\ndiverse X2X attacks. We also show the strong detection performance of UMD\nagainst several strong adaptive attacks.\n","authors":["Zhen Xiang","Zidi Xiong","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2305.18651v3.pdf","comment":"Proceedings of the 40th International Conference on Machine Learning"},{"id":"http://arxiv.org/abs/2308.04126v1","updated":"2023-08-08T08:30:16Z","published":"2023-08-08T08:30:16Z","title":"OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion\n and Infinite Data Generation","summary":" This paper presents OmniDataComposer, an innovative approach for multimodal\ndata fusion and unlimited data generation with an intent to refine and\nuncomplicate interplay among diverse data modalities. Coming to the core\nbreakthrough, it introduces a cohesive data structure proficient in processing\nand merging multimodal data inputs, which include video, audio, and text. Our\ncrafted algorithm leverages advancements across multiple operations such as\nvideo/image caption extraction, dense caption extraction, Automatic Speech\nRecognition (ASR), Optical Character Recognition (OCR), Recognize Anything\nModel(RAM), and object tracking. OmniDataComposer is capable of identifying\nover 6400 categories of objects, substantially broadening the spectrum of\nvisual information. It amalgamates these diverse modalities, promoting\nreciprocal enhancement among modalities and facilitating cross-modal data\ncorrection. \\textbf{The final output metamorphoses each video input into an\nelaborate sequential document}, virtually transmuting videos into thorough\nnarratives, making them easier to be processed by large language models. Future\nprospects include optimizing datasets for each modality to encourage unlimited\ndata generation. This robust base will offer priceless insights to models like\nChatGPT, enabling them to create higher quality datasets for video captioning\nand easing question-answering tasks based on video content. OmniDataComposer\ninaugurates a new stage in multimodal learning, imparting enormous potential\nfor augmenting AI's understanding and generation of complex, real-world data.\n","authors":["Dongyang Yu","Shihao Wang","Yuan Fang","Wangpeng An"],"pdf_url":"https://arxiv.org/pdf/2308.04126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04119v1","updated":"2023-08-08T08:19:43Z","published":"2023-08-08T08:19:43Z","title":"Constructing Custom Thermodynamics Using Deep Learning","summary":" One of the most exciting applications of AI is automated scientific discovery\nbased on previously amassed data, coupled with restrictions provided by the\nknown physical principles, including symmetries and conservation laws. Such\nautomated hypothesis creation and verification can assist scientists in\nstudying complex phenomena, where traditional physical intuition may fail. Of\nparticular importance are complex dynamic systems where their time evolution is\nstrongly influenced by varying external parameters. 
In this paper we develop a\nplatform based on a generalised Onsager principle to learn macroscopic\ndynamical descriptions of arbitrary stochastic dissipative systems directly\nfrom observations of their microscopic trajectories. We focus on systems whose\ncomplexity and sheer size render a complete microscopic description impractical,\nand for which constructing theoretical macroscopic models requires extensive domain\nknowledge or trial-and-error. Our machine learning approach addresses this by\nsimultaneously constructing reduced thermodynamic coordinates and interpreting\nthe dynamics on these coordinates. We demonstrate our method by studying\ntheoretically, and validating experimentally, the stretching of long polymer\nchains in an externally applied field. Specifically, we learn three\ninterpretable thermodynamic coordinates and build a dynamical landscape of\npolymer stretching, including (1) the identification of stable and transition\nstates and (2) the control of the stretching rate. We further demonstrate the\nuniversality of our approach by applying it to an unrelated problem in a\ndifferent domain: constructing macroscopic dynamics for spatial epidemics,\nshowing that our method addresses a wide range of scientific and technological\napplications.\n","authors":["Xiaoli Chen","Beatrice W. Soh","Zi-En Ooi","Eleonore Vissol-Gaudin","Haijun Yu","Kostya S. Novoselov","Kedar Hippalgaonkar","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2308.04119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1910.06832v3","updated":"2023-08-08T07:50:36Z","published":"2019-10-15T14:47:37Z","title":"Discriminator optimal transport","summary":" Within a broad class of generative adversarial networks, we show that the\ndiscriminator optimization process increases a lower bound of the dual cost\nfunction for the Wasserstein distance between the target distribution $p$ and\nthe generator distribution $p_G$. It implies that the trained discriminator can\napproximate optimal transport (OT) from $p_G$ to $p$. Based on some experiments\nand a bit of OT theory, we propose a discriminator optimal transport (DOT)\nscheme to improve generated images. We show that it improves the inception score\nand FID of unconditional GANs trained on CIFAR-10 and STL-10, as well as of a\npublic pre-trained conditional GAN model trained on ImageNet.\n","authors":["Akinori Tanaka"],"pdf_url":"https://arxiv.org/pdf/1910.06832v3.pdf","comment":"math errors corrected, note added"},{"id":"http://arxiv.org/abs/2308.04103v1","updated":"2023-08-08T07:38:44Z","published":"2023-08-08T07:38:44Z","title":"Explainable machine learning to enable high-throughput electrical\n conductivity optimization of doped conjugated polymers","summary":" The combination of high-throughput experimentation techniques and machine\nlearning (ML) has recently ushered in a new era of accelerated material\ndiscovery, enabling the identification of materials with cutting-edge\nproperties. However, the measurement of certain physical quantities remains\nchallenging to automate. Specifically, meticulous process control,\nexperimentation and laborious measurements are required to achieve optimal\nelectrical conductivity in doped polymer materials. We propose an ML approach,\nwhich relies on readily measured absorbance spectra, to accelerate the workflow\nassociated with measuring electrical conductivity. The first ML model\n(a classification model) accurately classifies samples with a conductivity >~25\nto 100 S/cm, achieving a maximum accuracy of 100%. 
For the subset of\nhighly conductive samples, we employed a second ML model (a regression model) to\npredict their conductivities, yielding an impressive test R2 value of 0.984. To\nvalidate the approach, we showed that the models, neither of which was trained on the\nsamples with the two highest conductivities (498 and 506 S/cm), were able to\ncorrectly classify and predict them in an extrapolative manner with satisfactory\nlevels of error. The proposed ML workflow results in an improvement in the\nefficiency of the conductivity measurements by 89% of the maximum achievable\nusing our experimental techniques. Furthermore, our approach addressed the\ncommon challenge of the lack of explainability in ML models by exploiting\nbespoke mathematical properties of the descriptors and ML model, allowing us to\ngain corroborated insights into the spectral influences on conductivity.\nThrough this study, we offer an accelerated pathway for optimizing the\nproperties of doped polymer materials while showcasing the valuable insights\nthat can be derived from purposeful utilization of ML in experimental science.\n","authors":["Ji Wei Yoon","Adithya Kumar","Pawan Kumar","Kedar Hippalgaonkar","J Senthilnath","Vijila Chellappan"],"pdf_url":"https://arxiv.org/pdf/2308.04103v1.pdf","comment":"33 Pages, 17 figures"},{"id":"http://arxiv.org/abs/2308.04102v1","updated":"2023-08-08T07:33:49Z","published":"2023-08-08T07:33:49Z","title":"Asynchronous Evolution of Deep Neural Network Architectures","summary":" Many evolutionary algorithms (EAs) take advantage of parallel evaluation of\ncandidates. However, if evaluation times vary significantly, many worker nodes\n(i.e.,\\ compute clients) are idle much of the time, waiting for the next\ngeneration to be created. Evolutionary neural architecture search (ENAS), a\nclass of EAs that optimizes the architecture and hyperparameters of deep neural\nnetworks, is particularly vulnerable to this issue. This paper proposes a\ngeneric asynchronous evaluation strategy (AES) that is then adapted to work\nwith ENAS. AES increases throughput by maintaining a queue of up to $K$\nindividuals ready to be sent to the workers for evaluation and proceeding to\nthe next generation as soon as $M<